{"id":"https://openalex.org/W4297841754","doi":"https://doi.org/10.21437/interspeech.2022-10568","title":"From Start to Finish: Latency Reduction Strategies for Incremental Speech Synthesis in Simultaneous Speech-to-Speech Translation","display_name":"From Start to Finish: Latency Reduction Strategies for Incremental Speech Synthesis in Simultaneous Speech-to-Speech Translation","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4297841754","doi":"https://doi.org/10.21437/interspeech.2022-10568"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-10568","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-10568","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://cris.maastrichtuniversity.nl/en/publications/e08ba8cd-72b3-4e20-99da-ff46d211ea7e","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076306018","display_name":"Danni Liu","orcid":"https://orcid.org/0000-0001-5419-1963"},"institutions":[{"id":"https://openalex.org/I34352273","display_name":"Maastricht University","ror":"https://ror.org/02jz4aj89","country_code":"NL","type":"education","lineage":["https://openalex.org/I34352273"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Danni Liu","raw_affiliation_strings":["Maastricht University, The Netherlands"],"affiliations":[{"raw_affiliation_string":"Maastricht University, The Netherlands","institution_ids":["https://openalex.org/I34352273"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087491225","display_name":"Changhan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Changhan Wang","raw_affiliation_strings":["Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103000746","display_name":"Hongyu Gong","orcid":"https://orcid.org/0000-0002-7071-0070"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongyu Gong","raw_affiliation_strings":["Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054878003","display_name":"Xutai Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xutai Ma","raw_affiliation_strings":["Johns Hopkins University, USA","Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University, USA","institution_ids":["https://openalex.org/I145311948"]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052049405","display_name":"Yun Tang","orcid":"https://orcid.org/0000-0003-2340-1109"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun Tang","raw_affiliation_strings":["Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5058915697","display_name":"Juan Pino","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Juan Pino","raw_affiliation_strings":["Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5076306018"],"corresponding_institution_ids":["https://openalex.org/I34352273"],"apc_list":null,"apc_paid":null,"fwci":0.1039,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.29180134,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1771","last_page":"1775"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8022757768630981},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.7562793493270874},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7437394857406616},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6453943252563477},{"id":"https://openalex.org/keywords/speech-translation","display_name":"Speech translation","score":0.5467851758003235},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.5044993162155151},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.4956461191177368},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.44649022817611694},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.42632976174354553},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.41495418548583984},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2819403111934662},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.25455349683761597},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07582792639732361},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07399916648864746},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.07121008634567261}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8022757768630981},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.7562793493270874},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7437394857406616},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6453943252563477},{"id":"https://openalex.org/C2780366754","wikidata":"https://www.wikidata.org/wiki/Q7494857","display_name":"Speech translation","level":3,"score":0.5467851758003235},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.5044993162155151},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.4956461191177368},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.44649022817611694},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.42632976174354553},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.41495418548583984},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2819403111934662},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.25455349683761597},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07582792639732361},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07399916648864746},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.07121008634567261},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C105580179","wikidata":"https://www.wikidata.org/wiki/Q188928","display_name":"Messenger RNA","level":3,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2022-10568","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-10568","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},{"id":"pmh:oai:cris.maastrichtuniversity.nl:openaire/e08ba8cd-72b3-4e20-99da-ff46d211ea7e","is_oa":true,"landing_page_url":"https://cris.maastrichtuniversity.nl/en/publications/e08ba8cd-72b3-4e20-99da-ff46d211ea7e","pdf_url":null,"source":{"id":"https://openalex.org/S4306402616","display_name":"Research Publications (Maastricht University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I34352273","host_organization_name":"Maastricht University","host_organization_lineage":["https://openalex.org/I34352273"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Liu, D, Wang, C, Gong, H, Ma, X, Tang, Y & Pino, J 2022, From Start to Finish: Latency Reduction Strategies for Incremental Speech Synthesis in Simultaneous Speech-to-Speech Translation. in Proceedings of INTERSPEECH 2022. vol. 2022-September, International Speech Communication Association (ISCA), Interspeech, pp. 1771-1775, 23rd Annual Conference of the International Speech Communication Association, Incheon, Korea, Republic of, 18/09/22. https://doi.org/10.21437/Interspeech.2022-10568","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"pmh:oai:cris.maastrichtuniversity.nl:openaire/e08ba8cd-72b3-4e20-99da-ff46d211ea7e","is_oa":true,"landing_page_url":"https://cris.maastrichtuniversity.nl/en/publications/e08ba8cd-72b3-4e20-99da-ff46d211ea7e","pdf_url":null,"source":{"id":"https://openalex.org/S4306402616","display_name":"Research Publications (Maastricht University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I34352273","host_organization_name":"Maastricht University","host_organization_lineage":["https://openalex.org/I34352273"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Liu, D, Wang, C, Gong, H, Ma, X, Tang, Y & Pino, J 2022, From Start to Finish: Latency Reduction Strategies for Incremental Speech Synthesis in Simultaneous Speech-to-Speech Translation. in Proceedings of INTERSPEECH 2022. vol. 2022-September, International Speech Communication Association (ISCA), Interspeech, pp. 1771-1775, 23rd Annual Conference of the International Speech Communication Association, Incheon, Korea, Republic of, 18/09/22. https://doi.org/10.21437/Interspeech.2022-10568","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1964771471","https://openalex.org/W2097203679","https://openalex.org/W2136545725","https://openalex.org/W2250981492","https://openalex.org/W2396366106","https://openalex.org/W2747874407","https://openalex.org/W2951456627","https://openalex.org/W2962784628","https://openalex.org/W2963250244","https://openalex.org/W2963532001","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2972802841","https://openalex.org/W2986976179","https://openalex.org/W3007068036","https://openalex.org/W3012492057","https://openalex.org/W3030437843","https://openalex.org/W3033411150","https://openalex.org/W3036601975","https://openalex.org/W3048023795","https://openalex.org/W3083224111","https://openalex.org/W3092028330","https://openalex.org/W3092424727","https://openalex.org/W3115075512","https://openalex.org/W3161782335","https://openalex.org/W3186843219","https://openalex.org/W3197407562","https://openalex.org/W3198429080","https://openalex.org/W4206534379","https://openalex.org/W4226021270"],"related_works":["https://openalex.org/W2338806053","https://openalex.org/W4385571610","https://openalex.org/W4223977554","https://openalex.org/W2293738010","https://openalex.org/W2164147372","https://openalex.org/W642007152","https://openalex.org/W4253660971","https://openalex.org/W2401827384","https://openalex.org/W2550171623","https://openalex.org/W596245619"],"abstract_inverted_index":{"Speech-to-speech":[0],"translation":[1,25,128],"(S2ST)":[2],"converts":[3],"input":[4],"speech":[5,7,27,67,76,90,127],"to":[6,48,69],"in":[8,16],"another":[9],"language.":[10],"A":[11],"challenge":[12],"of":[13,61,88],"delivering":[14],"S2ST":[15],"real":[17],"time":[18,60],"is":[19],"the":[20,24,57,65,75,80,86],"accumulated":[21],"delay":[22],"between":[23],"and":[26,105],"synthesis":[28],"modules.":[29],"While":[30],"recently":[31],"incremental":[32],"text-to-speech":[33],"(iTTS)":[34],"models":[35],"have":[36],"shown":[37],"large":[38],"quality":[39],"improvements,":[40],"they":[41],"typically":[42],"require":[43],"additional":[44],"future":[45],"text":[46],"inputs":[47],"reach":[49],"optimal":[50],"performance.":[51],"In":[52],"this":[53,100],"work,":[54],"we":[55,83],"minimize":[56],"initial":[58,81],"waiting":[59],"iTTS":[62],"by":[63,122],"adapting":[64],"upstream":[66],"translator":[68],"generate":[70],"high-quality":[71],"pseudo":[72],"lookahead":[73],"for":[74,114],"synthesizer.":[77],"After":[78],"mitigating":[79],"delay,":[82],"demonstrate":[84],"that":[85],"duration":[87],"synthesized":[89],"also":[91],"plays":[92],"a":[93,102,108],"crucial":[94],"role":[95],"on":[96],"latency.":[97],"We":[98],"formalize":[99],"as":[101],"latency":[103,115,121],"metric":[104],"then":[106],"present":[107],"simple":[109],"yet":[110],"effective":[111],"duration-scaling":[112],"approach":[113],"reduction.":[116],"Our":[117],"approaches":[118],"consistently":[119],"reduce":[120],"0.2-0.5":[123],"second":[124],"without":[125],"sacrificing":[126],"quality.":[129]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
