{"id":"https://openalex.org/W4296069345","doi":"https://doi.org/10.21437/interspeech.2022-10738","title":"Separator-Transducer-Segmenter: Streaming Recognition and Segmentation of Multi-party Speech","display_name":"Separator-Transducer-Segmenter: Streaming Recognition and Segmentation of Multi-party Speech","publication_year":2022,"publication_date":"2022-09-16","ids":{"openalex":"https://openalex.org/W4296069345","doi":"https://doi.org/10.21437/interspeech.2022-10738"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2022-10738","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-10738","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064043969","display_name":"Ilya Sklyar","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ilya Sklyar","raw_affiliation_strings":["Amazon Alexa"],"affiliations":[{"raw_affiliation_string":"Amazon Alexa","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079151774","display_name":"Anna Piunova","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anna Piunova","raw_affiliation_strings":["Amazon Alexa"],"affiliations":[{"raw_affiliation_string":"Amazon Alexa","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068886220","display_name":"Christian Osendorfer","orcid":null},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Christian Osendorfer","raw_affiliation_strings":["Amazon Alexa"],"affiliations":[{"raw_affiliation_string":"Amazon Alexa","institution_ids":["https://openalex.org/I1311688040"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5064043969"],"corresponding_institution_ids":["https://openalex.org/I1311688040"],"apc_list":null,"apc_paid":null,"fwci":0.2079,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.413273,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"4451","last_page":"4455"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9937999844551086,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6898653507232666},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5793710947036743},{"id":"https://openalex.org/keywords/transducer","display_name":"Transducer","score":0.5790924429893494},{"id":"https://openalex.org/keywords/separator","display_name":"Separator (oil production)","score":0.49784421920776367},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4913918077945709},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3488418161869049},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.24995499849319458},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.07125425338745117}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6898653507232666},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5793710947036743},{"id":"https://openalex.org/C56318395","wikidata":"https://www.wikidata.org/wiki/Q215928","display_name":"Transducer","level":2,"score":0.5790924429893494},{"id":"https://openalex.org/C185004128","wikidata":"https://www.wikidata.org/wiki/Q3011536","display_name":"Separator (oil production)","level":2,"score":0.49784421920776367},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4913918077945709},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3488418161869049},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.24995499849319458},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.07125425338745117},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2022-10738","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2022-10738","pdf_url":null,"source":{"id":"https://openalex.org/S4363604309","display_name":"Interspeech 2022","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2022","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1828163288","https://openalex.org/W2404199887","https://openalex.org/W2460742184","https://openalex.org/W2468573742","https://openalex.org/W2507946458","https://openalex.org/W2884797218","https://openalex.org/W2890244912","https://openalex.org/W2936774411","https://openalex.org/W2943934118","https://openalex.org/W2962715207","https://openalex.org/W2963250244","https://openalex.org/W2963477857","https://openalex.org/W2963574857","https://openalex.org/W2963773971","https://openalex.org/W2963843276","https://openalex.org/W2995166068","https://openalex.org/W3015746570","https://openalex.org/W3015927303","https://openalex.org/W3015995734","https://openalex.org/W3016232124","https://openalex.org/W3032969657","https://openalex.org/W3097643313","https://openalex.org/W3109079702","https://openalex.org/W3143843080","https://openalex.org/W3162847598","https://openalex.org/W3163907627","https://openalex.org/W3200955206","https://openalex.org/W3203417382","https://openalex.org/W3212886388","https://openalex.org/W4224920409","https://openalex.org/W4226305408","https://openalex.org/W4226491018","https://openalex.org/W4287240341","https://openalex.org/W4287240843","https://openalex.org/W4287251377","https://openalex.org/W4288083483"],"related_works":["https://openalex.org/W4249175327","https://openalex.org/W2063487995","https://openalex.org/W2971636757","https://openalex.org/W2511745526","https://openalex.org/W4301098834","https://openalex.org/W2383601987","https://openalex.org/W1990106678","https://openalex.org/W2012283803","https://openalex.org/W1978784234","https://openalex.org/W4384820447"],"abstract_inverted_index":{"Streaming":[0],"recognition":[1,51,74,82],"and":[2,52,67,83,92,144],"segmentation":[3,53,62,72,84,125],"of":[4,16,48,127],"multi-party":[5,128],"conversations":[6,129],"with":[7,38,95,106],"overlapping":[8],"speech":[9,49,81,96],"is":[10],"crucial":[11],"for":[12,115,124],"the":[13,28,156],"next":[14],"generation":[15],"voice":[17],"assistant":[18],"applications.In":[19],"this":[20],"work":[21,30],"we":[22,58,77,104,119,137],"address":[23],"its":[24],"challenges":[25],"discovered":[26],"in":[27,54],"previous":[29],"on":[31,151],"multi-turn":[32],"recurrent":[33],"neural":[34],"network":[35],"transducer":[36],"(MT-RNN-T)":[37],"a":[39,55,60,121],"novel":[40,122],"approach,":[41],"separator-transducer-segmenter":[42],"(STS),":[43],"that":[44,70],"enables":[45],"tighter":[46],"integration":[47],"separation,":[50],"single":[56],"model.First,":[57],"propose":[59],"new":[61],"modeling":[63],"strategy":[64],"through":[65,86,130],"start-of-turn":[66],"end-of-turn":[68,107],"tokens":[69],"improves":[71],"without":[73],"accuracy":[75,85,142],"degradation.Second,":[76],"further":[78],"improve":[79,112],"both":[80],"an":[87,100],"emission":[88,108,131],"regularization":[89],"method,":[90],"FastEmit,":[91],"multi-task":[93],"training":[94,102],"activity":[97],"information":[98],"as":[99],"additional":[101],"signal.Third,":[103],"experiment":[105],"latency":[109,132],"penalty":[110],"to":[111,155],"end-point":[113],"detection":[114],"each":[116],"speaker":[117],"turn.Finally,":[118],"establish":[120],"framework":[123],"analysis":[126],"metrics.With":[133],"our":[134],"best":[135],"model,":[136],"report":[138],"4.6%":[139],"abs.turn":[140],"counting":[141],"improvement":[143,150],"17%":[145],"rel.word":[146],"error":[147],"rate":[148],"(WER)":[149],"Lib-riCSS":[152],"dataset":[153],"compared":[154],"previously":[157],"published":[158],"work.":[159]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
