{"id":"https://openalex.org/W4407097719","doi":"https://doi.org/10.1109/lsp.2025.3537949","title":"Text-to-Speech With Lip Synchronization Based on Speech-Assisted Text-to-Video Alignment and Masked Unit Prediction","display_name":"Text-to-Speech With Lip Synchronization Based on Speech-Assisted Text-to-Video Alignment and Masked Unit Prediction","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4407097719","doi":"https://doi.org/10.1109/lsp.2025.3537949"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2025.3537949","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3537949","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004566479","display_name":"Youngdo Ahn","orcid":"https://orcid.org/0000-0001-7579-662X"},"institutions":[{"id":"https://openalex.org/I39534123","display_name":"Gwangju Institute of Science and Technology","ror":"https://ror.org/024kbgz78","country_code":"KR","type":"education","lineage":["https://openalex.org/I39534123"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Youngdo Ahn","raw_affiliation_strings":["School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"],"raw_orcid":"https://orcid.org/0000-0001-7579-662X","affiliations":[{"raw_affiliation_string":"School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea","institution_ids":["https://openalex.org/I39534123"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011281298","display_name":"Jong-Wook Chae","orcid":null},"institutions":[{"id":"https://openalex.org/I39534123","display_name":"Gwangju Institute of Science and Technology","ror":"https://ror.org/024kbgz78","country_code":"KR","type":"education","lineage":["https://openalex.org/I39534123"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jongwook Chae","raw_affiliation_strings":["School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"],"raw_orcid":"https://orcid.org/0000-0001-8900-251X","affiliations":[{"raw_affiliation_string":"School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea","institution_ids":["https://openalex.org/I39534123"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054155133","display_name":"Jong Won Shin","orcid":"https://orcid.org/0000-0002-8910-0264"},"institutions":[{"id":"https://openalex.org/I39534123","display_name":"Gwangju Institute of Science and Technology","ror":"https://ror.org/024kbgz78","country_code":"KR","type":"education","lineage":["https://openalex.org/I39534123"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jong Won Shin","raw_affiliation_strings":["School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea"],"raw_orcid":"https://orcid.org/0000-0002-8910-0264","affiliations":[{"raw_affiliation_string":"School of Electrical Engineering and Computer Science, Gwangju Institute of Science and Technology, Gwangju, South Korea","institution_ids":["https://openalex.org/I39534123"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5004566479"],"corresponding_institution_ids":["https://openalex.org/I39534123"],"apc_list":null,"apc_paid":null,"fwci":5.1013,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.94363385,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":"32","issue":null,"first_page":"961","last_page":"965"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.800691545009613},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7394415140151978},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.6010969877243042},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5289487838745117},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4910609722137451},{"id":"https://openalex.org/keywords/unit","display_name":"Unit (ring theory)","score":0.4385656416416168},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4260382354259491},{"id":"https://openalex.org/keywords/linear-predictive-coding","display_name":"Linear predictive coding","score":0.42577287554740906},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.390198290348053},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09223160147666931},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08922624588012695}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.800691545009613},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7394415140151978},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.6010969877243042},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5289487838745117},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4910609722137451},{"id":"https://openalex.org/C122637931","wikidata":"https://www.wikidata.org/wiki/Q118084","display_name":"Unit (ring theory)","level":2,"score":0.4385656416416168},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4260382354259491},{"id":"https://openalex.org/C59883199","wikidata":"https://www.wikidata.org/wiki/Q1826438","display_name":"Linear predictive coding","level":3,"score":0.42577287554740906},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.390198290348053},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09223160147666931},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08922624588012695},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0},{"id":"https://openalex.org/C145420912","wikidata":"https://www.wikidata.org/wiki/Q853077","display_name":"Mathematics education","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2025.3537949","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3537949","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.4099999964237213,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2150593711","https://openalex.org/W2604379605","https://openalex.org/W2808631503","https://openalex.org/W2891205112","https://openalex.org/W2896457183","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W3015338123","https://openalex.org/W3015826515","https://openalex.org/W3015841875","https://openalex.org/W3033411150","https://openalex.org/W3035626590","https://openalex.org/W3081492798","https://openalex.org/W3095545636","https://openalex.org/W3098557217","https://openalex.org/W3101631197","https://openalex.org/W3140429000","https://openalex.org/W3168662520","https://openalex.org/W3196103482","https://openalex.org/W3204420730","https://openalex.org/W3209059054","https://openalex.org/W4372348103","https://openalex.org/W4375869364","https://openalex.org/W4382202703","https://openalex.org/W4385245566","https://openalex.org/W4385764360","https://openalex.org/W4385822362","https://openalex.org/W4385823117","https://openalex.org/W4385823163","https://openalex.org/W4392114301","https://openalex.org/W4392902644","https://openalex.org/W4392902784","https://openalex.org/W4394671563","https://openalex.org/W6604828220","https://openalex.org/W6754420807","https://openalex.org/W6755207826","https://openalex.org/W6763832098","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6780218876","https://openalex.org/W6790356757","https://openalex.org/W6796464841","https://openalex.org/W6803063772","https://openalex.org/W6810168380","https://openalex.org/W6810940779","https://openalex.org/W6840815571","https://openalex.org/W6845479124"],"related_works":["https://openalex.org/W193702574","https://openalex.org/W1842536210","https://openalex.org/W1524172635","https://openalex.org/W1796893744","https://openalex.org/W1523214805","https://openalex.org/W4294771049","https://openalex.org/W2559040841","https://openalex.org/W2341426843","https://openalex.org/W596245619","https://openalex.org/W2131711534"],"abstract_inverted_index":{"Text-to-speech":[0],"(TTS)":[1],"with":[2,15,46,143],"lip":[3,17,148],"synchronization":[4,213],"(TTSLS)":[5],"is":[6,80,85],"the":[7,16,23,27,36,68,75,82,114,120,132,159,163,168,172,175,181,185],"task":[8],"of":[9,77,174],"generating":[10],"a":[11,20,47,56,100,125,137,152,189],"speech":[12,116,215],"signal":[13,117],"synchronized":[14],"movements":[18],"in":[19,119,162,210],"video":[21,28,40,62,144,194],"given":[22],"text":[24],"transcription":[25],"and":[26,39,108,214],"without":[29,146],"speech.":[30],"Previous":[31],"approaches":[32],"to":[33,54,61],"TTSLS":[34,101,208],"aligned":[35],"phoneme":[37,57,79],"sequence":[38],"frames":[41,63,145],"using":[42,131,188],"scaled":[43],"dot-product":[44],"attention":[45],"diagonal":[48,69],"constraint":[49,70],"loss,":[50],"which":[51,84],"was":[52],"employed":[53],"prevent":[55],"from":[58,171],"being":[59],"assigned":[60],"too":[64],"far":[65],"away.":[66],"However,":[67],"loss":[71,126,156],"basically":[72],"assumes":[73],"that":[74,158,202],"duration":[76],"each":[78],"about":[81],"same,":[83],"not":[86],"always":[87],"valid":[88],"as":[89],"speaking":[90],"styles":[91],"can":[92,166],"be":[93],"different.":[94],"In":[95,177],"this":[96],"letter,":[97],"we":[98,123,150,179],"propose":[99],"system":[102,165],"based":[103],"on":[104],"speech-assisted":[105],"text-to-video":[106,129],"alignment":[107,130,134],"masked":[109,153,169],"unit":[110,154,160,186],"prediction.":[111],"By":[112],"utilizing":[113],"ground-truth":[115],"available":[118],"training":[121],"phase,":[122],"construct":[124],"function":[127],"for":[128,184,193],"text-to-speech":[133],"obtained":[135],"by":[136,196],"pre-trained":[138],"TTS":[139],"model.":[140],"To":[141],"deal":[142],"frontal":[147],"images,":[149],"employ":[151],"prediction":[155],"so":[157],"predictor":[161,187],"proposed":[164,204],"estimate":[167],"units":[170],"rest":[173],"units.":[176],"addition,":[178],"modified":[180],"probability":[182],"distribution":[183],"learnable":[190],"null":[191],"embedding":[192],"inspired":[195],"classifier-free":[197],"guidance.":[198],"Experimental":[199],"results":[200],"demonstrated":[201],"our":[203],"method":[205],"outperformed":[206],"previous":[207],"systems":[209],"both":[211],"lip-speech":[212],"recognition":[216],"performance.":[217]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
