{"id":"https://openalex.org/W4408345771","doi":"https://doi.org/10.1109/icassp49660.2025.10890684","title":"Improving Lip-synchrony in Direct Audio-Visual Speech-to-Speech Translation","display_name":"Improving Lip-synchrony in Direct Audio-Visual Speech-to-Speech Translation","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408345771","doi":"https://doi.org/10.1109/icassp49660.2025.10890684"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890684","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890684","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017966455","display_name":"Lucas Goncalves","orcid":"https://orcid.org/0000-0001-9613-1002"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Lucas Goncalves","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057049978","display_name":"Prashant Mathur","orcid":"https://orcid.org/0000-0002-9271-1373"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Prashant Mathur","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069475690","display_name":"Xing Niu","orcid":"https://orcid.org/0000-0001-8834-792X"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Xing Niu","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058760668","display_name":"Chandrashekhar Lavania","orcid":null},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Chandrashekhar Lavania","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019745645","display_name":"B. Houston","orcid":"https://orcid.org/0000-0001-5666-3629"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Brady Houston","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080301608","display_name":"Srikanth Vishnubhotla","orcid":null},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Srikanth Vishnubhotla","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Lijia Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Lijia Sun","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009637956","display_name":"Anthony Ferritto","orcid":null},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Anthony Ferritto","raw_affiliation_strings":["Amazon"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon","institution_ids":["https://openalex.org/I4210089985"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5017966455"],"corresponding_institution_ids":["https://openalex.org/I4210089985"],"apc_list":null,"apc_paid":null,"fwci":2.478,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.86805556,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9681000113487244,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9681000113487244,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9362000226974487,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7516515254974365},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7444555759429932},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.6760929822921753},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5427480340003967},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.53557950258255},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.472512811422348},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4539967179298401},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.3395863175392151},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.2958788275718689},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.12165522575378418}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7516515254974365},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7444555759429932},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.6760929822921753},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5427480340003967},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.53557950258255},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.472512811422348},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4539967179298401},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.3395863175392151},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2958788275718689},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.12165522575378418},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890684","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890684","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2101105183","https://openalex.org/W2604379605","https://openalex.org/W2787603175","https://openalex.org/W2890952074","https://openalex.org/W2971634123","https://openalex.org/W3000442796","https://openalex.org/W3081492798","https://openalex.org/W3180374548","https://openalex.org/W3205904637","https://openalex.org/W4224931655","https://openalex.org/W4225302758","https://openalex.org/W4297841628","https://openalex.org/W4382357812","https://openalex.org/W4383337695","https://openalex.org/W4385570538","https://openalex.org/W4385822664","https://openalex.org/W4390873467","https://openalex.org/W4402716129","https://openalex.org/W4402980407","https://openalex.org/W6731064840","https://openalex.org/W6757817989","https://openalex.org/W6779607597","https://openalex.org/W6783867762","https://openalex.org/W6803063772","https://openalex.org/W6810168380","https://openalex.org/W6850505448","https://openalex.org/W6855650468","https://openalex.org/W6864518445"],"related_works":["https://openalex.org/W2338806053","https://openalex.org/W4385571610","https://openalex.org/W39235475","https://openalex.org/W1540615732","https://openalex.org/W2164147372","https://openalex.org/W2550171623","https://openalex.org/W4253660971","https://openalex.org/W4360995948","https://openalex.org/W1909292483","https://openalex.org/W1428730622"],"abstract_inverted_index":{"Audio-Visual":[0],"Speech-to-Speech":[1],"Translation":[2],"(AVS2S)":[3],"typically":[4],"prioritizes":[5],"improving":[6],"translation":[7,123],"quality":[8,108],"and":[9,106],"naturalness.":[10],"However,":[11],"an":[12,81],"equally":[13],"critical":[14],"aspect":[15],"in":[16,34,45,75,91,122],"audio-visual":[17,77],"content":[18],"is":[19],"lip-synchrony\u2014ensuring":[20],"that":[21],"the":[22,25,28,40,63,104,110,116],"movements":[23],"of":[24,42,66,85,109],"lips":[26],"match":[27],"spoken":[29],"content\u2014essential":[30],"for":[31],"maintaining":[32],"realism":[33],"dubbed":[35],"videos.":[36],"Despite":[37],"its":[38],"importance,":[39],"inclusion":[41],"lip-synchrony":[43,60,74],"constraints":[44],"AVS2S":[46,67],"models":[47],"has":[48],"been":[49],"largely":[50],"overlooked.":[51],"This":[52],"study":[53],"addresses":[54],"this":[55],"gap":[56],"by":[57],"integrating":[58],"a":[59,88,94],"loss":[61],"into":[62],"training":[64],"process":[65],"models.":[68],"Our":[69],"proposed":[70],"method":[71],"significantly":[72],"enhances":[73],"direct":[76],"speechto-speech":[78],"translation,":[79],"achieving":[80],"average":[82],"LSE-D":[83,92],"score":[84],"10.67,":[86],"representing":[87],"9.2%":[89],"reduction":[90],"over":[93],"strong":[95],"baseline":[96],"across":[97],"four":[98],"language":[99],"pairs.":[100],"Additionally,":[101],"it":[102],"maintains":[103],"naturalness":[105],"high":[107],"translated":[111],"speech":[112],"when":[113],"overlaid":[114],"onto":[115],"original":[117],"video,":[118],"without":[119],"any":[120],"degradation":[121],"quality.":[124]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
