{"id":"https://openalex.org/W4406461869","doi":"https://doi.org/10.1109/slt61566.2024.10832330","title":"Serialized Speech Information Guidance with Overlapped Encoding Separation for Multi-Speaker Automatic Speech Recognition","display_name":"Serialized Speech Information Guidance with Overlapped Encoding Separation for Multi-Speaker Automatic Speech Recognition","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461869","doi":"https://doi.org/10.1109/slt61566.2024.10832330"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832330","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832330","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088361896","display_name":"Hao Shi","orcid":"https://orcid.org/0000-0003-3373-2147"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I39012071","display_name":"Kyoto College of Graduate Studies for Informatics","ror":"https://ror.org/05mzj8a56","country_code":"JP","type":"education","lineage":["https://openalex.org/I39012071"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Hao Shi","raw_affiliation_strings":["Kyoto University,Graduate School of Informatics,Kyoto,Japan"],"affiliations":[{"raw_affiliation_string":"Kyoto University,Graduate School of Informatics,Kyoto,Japan","institution_ids":["https://openalex.org/I39012071","https://openalex.org/I22299242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055954787","display_name":"Yuan Gao","orcid":"https://orcid.org/0000-0002-3526-5856"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I39012071","display_name":"Kyoto College of Graduate Studies for Informatics","ror":"https://ror.org/05mzj8a56","country_code":"JP","type":"education","lineage":["https://openalex.org/I39012071"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yuan Gao","raw_affiliation_strings":["Kyoto University,Graduate School of Informatics,Kyoto,Japan"],"affiliations":[{"raw_affiliation_string":"Kyoto University,Graduate School of Informatics,Kyoto,Japan","institution_ids":["https://openalex.org/I39012071","https://openalex.org/I22299242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031088292","display_name":"Zhaoheng Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaoheng Ni","raw_affiliation_strings":["Meta,New York,USA"],"affiliations":[{"raw_affiliation_string":"Meta,New York,USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061781358","display_name":"Kawahara Tatsuya","orcid":"https://orcid.org/0000-0002-8474-7077"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]},{"id":"https://openalex.org/I39012071","display_name":"Kyoto College of Graduate Studies for Informatics","ror":"https://ror.org/05mzj8a56","country_code":"JP","type":"education","lineage":["https://openalex.org/I39012071"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tatsuya Kawahara","raw_affiliation_strings":["Kyoto University,Graduate School of Informatics,Kyoto,Japan"],"affiliations":[{"raw_affiliation_string":"Kyoto University,Graduate School of Informatics,Kyoto,Japan","institution_ids":["https://openalex.org/I39012071","https://openalex.org/I22299242"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5088361896"],"corresponding_institution_ids":["https://openalex.org/I22299242","https://openalex.org/I39012071"],"apc_list":null,"apc_paid":null,"fwci":1.3781,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.85354825,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"193","last_page":"199"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.8286652565002441},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8036563396453857},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.6917202472686768},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.6492500901222229},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.5132560729980469},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.46450161933898926},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4472198188304901},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.42609700560569763},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.4244905114173889},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37340492010116577},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.053260982036590576}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.8286652565002441},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8036563396453857},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.6917202472686768},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.6492500901222229},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.5132560729980469},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.46450161933898926},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4472198188304901},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.42609700560569763},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.4244905114173889},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37340492010116577},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.053260982036590576}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt61566.2024.10832330","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832330","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1992475611","https://openalex.org/W2035576074","https://openalex.org/W2127141656","https://openalex.org/W2161742217","https://openalex.org/W2255466643","https://openalex.org/W2327501763","https://openalex.org/W2460742184","https://openalex.org/W2734774145","https://openalex.org/W2766219058","https://openalex.org/W2962866211","https://openalex.org/W2972541922","https://openalex.org/W3015746570","https://openalex.org/W3027008958","https://openalex.org/W3032969657","https://openalex.org/W3094821064","https://openalex.org/W3097777922","https://openalex.org/W3154262773","https://openalex.org/W3209984917","https://openalex.org/W3211278025","https://openalex.org/W4226390724","https://openalex.org/W4372267368","https://openalex.org/W4372342485","https://openalex.org/W4375869385","https://openalex.org/W4385822516","https://openalex.org/W4385822736","https://openalex.org/W4385822775","https://openalex.org/W4385823311","https://openalex.org/W4387698265","https://openalex.org/W4389984014","https://openalex.org/W4392903292","https://openalex.org/W4392903310","https://openalex.org/W4392904327","https://openalex.org/W4392904444","https://openalex.org/W4399168695","https://openalex.org/W4399389827","https://openalex.org/W4400667207","https://openalex.org/W4402111705","https://openalex.org/W6635078382","https://openalex.org/W6777776875","https://openalex.org/W6797091977","https://openalex.org/W6849997751","https://openalex.org/W6859349807","https://openalex.org/W6860554047","https://openalex.org/W6869090967","https://openalex.org/W6870275307","https://openalex.org/W6996806968"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1999004162","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W642007152","https://openalex.org/W2056066842"],"abstract_inverted_index":{"Serialized":[0],"output":[1],"training":[2],"(SOT)":[3],"attracts":[4],"increasing":[5],"attention":[6,27,52,99],"due":[7],"to":[8,24,40,64,82,93,97,125],"its":[9],"convenience":[10],"and":[11,51,107,134,149,155,169,177],"flexibility":[12],"for":[13,173],"multi-speaker":[14,67],"automatic":[15],"speech":[16,77],"recognition":[17],"(ASR).":[18],"However,":[19],"it":[20],"is":[21,59],"not":[22],"easy":[23],"train":[25],"with":[26,69],"loss":[28,123],"only.":[29],"In":[30],"this":[31],"paper,":[32],"we":[33,73],"propose":[34,74],"the":[35,43,46,62,66,75,85,111,118,127,139,152,174],"overlapped":[36,119],"encoding":[37,113],"separation":[38],"(EncSep)":[39],"fully":[41],"utilize":[42,84],"benefits":[44],"of":[45,145],"connectionist":[47],"temporal":[48],"classification":[49],"(CTC)":[50],"(CTC-Attention)":[53],"hybrid":[54],"loss.":[55],"This":[56],"additional":[57],"separator":[58],"inserted":[60],"after":[61],"encoder":[63,128],"extract":[65],"information":[68,78,96],"CTC":[70,122],"losses.":[71],"Furthermore,":[72],"serialized":[76],"guidance":[79],"SOT":[80],"(GEncSep)":[81],"further":[83,161],"separated":[86,89,116],"encodings.":[87],"The":[88,102,121],"streams":[90],"are":[91],"concatenated":[92],"provide":[94],"single-speaker":[95,112],"guide":[98],"during":[100],"decoding.":[101],"experimental":[103],"results":[104],"on":[105,151],"Libri2Mix":[106,154,176],"Libri3Mix":[108,156,178],"show":[109],"that":[110],"can":[114],"be":[115],"from":[117],"encoding.":[120],"helps":[124],"improve":[126],"representation":[129],"under":[130],"complex":[131],"scenarios":[132],"(three-speaker":[133],"noisy":[135,153,175],"conditions),":[136],"which":[137,164],"makes":[138],"EncSep":[140],"have":[141],"a":[142],"relative":[143,171],"improvement":[144,172],"more":[146,166],"than":[147,167],"8%":[148],"6%":[150],"evaluation":[157,179],"sets,":[158],"respectively.":[159],"GEncSep":[160],"improved":[162],"performance,":[163],"was":[165],"12%":[168],"9%":[170],"sets.":[180]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
