{"id":"https://openalex.org/W7148387564","doi":"https://doi.org/10.1109/asru65441.2025.11434752","title":"Streaming Endpointer for Spoken Dialogue using Neural Audio Codecs and Label-Delayed Training","display_name":"Streaming Endpointer for Spoken Dialogue using Neural Audio Codecs and Label-Delayed Training","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148387564","doi":"https://doi.org/10.1109/asru65441.2025.11434752"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434752","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434752","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023703313","display_name":"Sathvik Udupa","orcid":"https://orcid.org/0000-0002-2225-5464"},"institutions":[{"id":"https://openalex.org/I60587646","display_name":"Brno University of Technology","ror":"https://ror.org/03613d656","country_code":"CZ","type":"education","lineage":["https://openalex.org/I60587646"]}],"countries":["CZ"],"is_corresponding":true,"raw_author_name":"Sathvik Udupa","raw_affiliation_strings":["Brno University of Technology,Czechia"],"affiliations":[{"raw_affiliation_string":"Brno University of Technology,Czechia","institution_ids":["https://openalex.org/I60587646"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132789061","display_name":"Shinji Watanabe","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Carnegie Mellon University,United States"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,United States","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007660964","display_name":"Petr Schwarz","orcid":"https://orcid.org/0000-0002-2281-9637"},"institutions":[{"id":"https://openalex.org/I60587646","display_name":"Brno University of Technology","ror":"https://ror.org/03613d656","country_code":"CZ","type":"education","lineage":["https://openalex.org/I60587646"]}],"countries":["CZ"],"is_corresponding":false,"raw_author_name":"Petr Schwarz","raw_affiliation_strings":["Brno University of Technology,Czechia"],"affiliations":[{"raw_affiliation_string":"Brno University of Technology,Czechia","institution_ids":["https://openalex.org/I60587646"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132804916","display_name":"Jan \u010cernock\u00fd","orcid":null},"institutions":[{"id":"https://openalex.org/I60587646","display_name":"Brno University of Technology","ror":"https://ror.org/03613d656","country_code":"CZ","type":"education","lineage":["https://openalex.org/I60587646"]}],"countries":["CZ"],"is_corresponding":false,"raw_author_name":"Jan Cernocky","raw_affiliation_strings":["Brno University of Technology,Czechia"],"affiliations":[{"raw_affiliation_string":"Brno University of Technology,Czechia","institution_ids":["https://openalex.org/I60587646"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5023703313"],"corresponding_institution_ids":["https://openalex.org/I60587646"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87348764,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.48030000925064087,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.48030000925064087,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.28929999470710754,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.0608999989926815,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codec","display_name":"Codec","score":0.6675000190734863},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.6090999841690063},{"id":"https://openalex.org/keywords/cutoff","display_name":"Cutoff","score":0.5339999794960022},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.39419999718666077},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.3653999865055084},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.36169999837875366},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.35530000925064087},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.35440000891685486}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8055999875068665},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7366999983787537},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.6675000190734863},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.6090999841690063},{"id":"https://openalex.org/C2778217198","wikidata":"https://www.wikidata.org/wiki/Q556977","display_name":"Cutoff","level":2,"score":0.5339999794960022},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.39419999718666077},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3799000084400177},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.3653999865055084},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.36169999837875366},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.35530000925064087},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.35440000891685486},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3418000042438507},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3188000023365021},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3012000024318695},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.29899999499320984},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.2939999997615814},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C6142545","wikidata":"https://www.wikidata.org/wiki/Q1455881","display_name":"Cutoff frequency","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.27059999108314514},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.26080000400543213}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434752","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434752","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321408","display_name":"Ministry of Education","ror":"https://ror.org/01p262204"},{"id":"https://openalex.org/F4320334322","display_name":"HORIZON EUROPE Framework Programme","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1932883564","https://openalex.org/W2100844198","https://openalex.org/W2129120544","https://openalex.org/W2625979394","https://openalex.org/W2792351646","https://openalex.org/W2891367150","https://openalex.org/W2892300106","https://openalex.org/W2935756939","https://openalex.org/W2979826702","https://openalex.org/W3015383801","https://openalex.org/W3015927303","https://openalex.org/W3081958278","https://openalex.org/W3198217962","https://openalex.org/W3215615641","https://openalex.org/W4281492411","https://openalex.org/W4296070362","https://openalex.org/W4297841778","https://openalex.org/W4297841817","https://openalex.org/W4372260277","https://openalex.org/W4372266509","https://openalex.org/W4372270198","https://openalex.org/W4375869207","https://openalex.org/W4385823227","https://openalex.org/W4391021778","https://openalex.org/W4391216255","https://openalex.org/W4392902922","https://openalex.org/W4402118927","https://openalex.org/W4402670057","https://openalex.org/W4402915905"],"related_works":[],"abstract_inverted_index":{"Accurate,":[0],"low-latency":[1],"endpointing":[2,24],"is":[3],"crucial":[4],"for":[5,25,79,85],"effective":[6],"spoken":[7],"dialogue":[8],"systems.":[9],"While":[10],"traditional":[11],"endpointers":[12],"often":[13],"rely":[14],"on":[15],"spectrum-based":[16],"audio":[17,42],"features,":[18,35],"this":[19],"work":[20],"proposes":[21],"real-time":[22],"speech":[23,102],"multi-turn":[26],"dialogues":[27],"using":[28],"streaming,":[29],"low-bitrate":[30],"Neural":[31],"Audio":[32],"Codec":[33],"(NAC)":[34],"building":[36],"upon":[37],"recent":[38],"advancements":[39],"in":[40],"neural":[41],"codecs.":[43],"To":[44],"further":[45],"reduce":[46],"cutoff":[47,75,117],"errors,":[48],"we":[49,94],"introduce":[50],"a":[51,58,80,86,99],"novel":[52],"label":[53,69],"delay":[54,70],"training":[55],"scheme.":[56],"At":[57],"fixed":[59],"median":[60,108],"latency":[61],"of":[62],"160":[63],"ms,":[64],"our":[65],"combined":[66],"NAC":[67],"and":[68,83,114],"approach":[71],"achieves":[72],"significant":[73],"relative":[74],"error":[76,118],"reductions:":[77],"42.7%":[78],"single-stream":[81],"endpointer":[82],"37.5%":[84],"two-stream":[87],"configuration,":[88],"compared":[89],"to":[90],"baseline":[91],"methods.":[92],"Finally,":[93],"demonstrate":[95],"efficient":[96],"integration":[97],"with":[98],"codec-based":[100],"pretrained":[101],"large":[103],"language":[104],"model,":[105],"improving":[106],"its":[107,116],"response":[109],"time":[110],"by":[111,119],"1200":[112],"ms":[113],"reducing":[115],"35%.":[120]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-04-03T00:00:00"}
