{"id":"https://openalex.org/W4408352742","doi":"https://doi.org/10.1109/icassp49660.2025.10889841","title":"META-CAT: Speaker-Informed Speech Embeddings via Meta Information Concatenation for Multi-talker ASR","display_name":"META-CAT: Speaker-Informed Speech Embeddings via Meta Information Concatenation for Multi-talker ASR","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352742","doi":"https://doi.org/10.1109/icassp49660.2025.10889841"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10889841","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889841","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101979902","display_name":"Jinhan Wang","orcid":"https://orcid.org/0000-0003-1930-2271"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jinhan Wang","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100372965","display_name":"Weiqing Wang","orcid":"https://orcid.org/0000-0002-9578-819X"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Weiqing Wang","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073181369","display_name":"Kunal Dhawan","orcid":"https://orcid.org/0000-0002-5276-2475"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kunal Dhawan","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101511624","display_name":"Taejin Park","orcid":"https://orcid.org/0000-0003-2040-5884"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Taejin Park","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087820105","display_name":"Myungjong Kim","orcid":"https://orcid.org/0000-0003-0756-0177"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Myungjong Kim","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022587932","display_name":"Ivan Medennikov","orcid":"https://orcid.org/0000-0001-5381-3433"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ivan Medennikov","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100411301","display_name":"He Huang","orcid":"https://orcid.org/0000-0002-9217-4977"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"He Huang","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040007143","display_name":"Nithin Rao Koluguri","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nithin Koluguri","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040747392","display_name":"Jagadeesh Balam","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jagadeesh Balam","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032957280","display_name":"Boris Ginsburg","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Boris Ginsburg","raw_affiliation_strings":["NVIDIA,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5101979902"],"corresponding_institution_ids":["https://openalex.org/I4210127875"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02020217,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9801999926567078,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/concatenation","display_name":"Concatenation (mathematics)","score":0.8981995582580566},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7450871467590332},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6825575828552246},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3761669397354126},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33157867193222046},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10605642199516296},{"id":"https://openalex.org/keywords/arithmetic","display_name":"Arithmetic","score":0.0672086775302887}],"concepts":[{"id":"https://openalex.org/C87619178","wikidata":"https://www.wikidata.org/wiki/Q126002","display_name":"Concatenation (mathematics)","level":2,"score":0.8981995582580566},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7450871467590332},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6825575828552246},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3761669397354126},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33157867193222046},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10605642199516296},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.0672086775302887}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10889841","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889841","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.4099999964237213}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2060656758","https://openalex.org/W2144499799","https://openalex.org/W2900212944","https://openalex.org/W2963773971","https://openalex.org/W3020336359","https://openalex.org/W3094821064","https://openalex.org/W3097263872","https://openalex.org/W3154262773","https://openalex.org/W3178462146","https://openalex.org/W3196194966","https://openalex.org/W3204618061","https://openalex.org/W4226491018","https://openalex.org/W4372259971","https://openalex.org/W4385823037","https://openalex.org/W4391021542","https://openalex.org/W4392903223","https://openalex.org/W6873771331"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2373577936","https://openalex.org/W3095575180","https://openalex.org/W2389596151","https://openalex.org/W4221148444","https://openalex.org/W4226054107","https://openalex.org/W4387678054","https://openalex.org/W3204019825"],"abstract_inverted_index":{"We":[0,40],"propose":[1],"a":[2,27,35,59,113,116,134,144],"novel":[3],"end-to-end":[4,29,136],"multi-talker":[5,137],"automatic":[6],"speech":[7],"recognition":[8],"(ASR)":[9],"framework":[10,139],"that":[11,66,78,133],"enables":[12],"both":[13,71,86,124],"multi-speaker":[14],"(MS)":[15],"ASR":[16,49,138],"and":[17,73,88,126],"target-speaker":[18],"(TS)":[19],"ASR.":[20],"Our":[21,75],"proposed":[22,80],"model":[23,119],"is":[24],"trained":[25],"in":[26,85,157],"fully":[28],"manner,":[30],"incorporating":[31],"speaker":[32,37,56,153],"supervision":[33,57],"from":[34,54],"pre-trained":[36],"diarization":[38],"module.":[39],"introduce":[41],"an":[42],"intuitive":[43],"yet":[44],"effective":[45],"method":[46],"for":[47,94,150],"masking":[48,103],"encoder":[50],"activations":[51],"using":[52],"output":[53],"the":[55,79,92,105,148,151],"module,":[58],"technique":[60],"we":[61,111],"term":[62],"Meta-Cat":[63],"(meta-information":[64],"concatenation),":[65],"can":[67,121,140],"be":[68,141],"applied":[69],"to":[70],"MS-ASR":[72,87,125],"TS-ASR.":[74],"results":[76],"demonstrate":[77,112],"architecture":[81],"achieves":[82],"competitive":[83],"performance":[84],"TS-ASR":[89,127],"tasks,":[90],"without":[91],"need":[93,149],"traditional":[95],"methods,":[96],"such":[97],"as":[98],"neural":[99],"mask":[100],"estimation":[101],"or":[102,107],"at":[104],"audio":[106],"feature":[108],"level.":[109],"Furthermore,":[110],"glimpse":[114],"of":[115],"unified":[117],"dual-task":[118],"which":[120],"e\ufb03ciently":[122],"handle":[123],"tasks.":[128],"Thus,":[129],"this":[130],"work":[131],"illustrates":[132],"robust":[135],"implemented":[142],"with":[143],"streamlined":[145],"architecture,":[146],"obviating":[147],"complex":[152],"filtering":[154],"mechanisms":[155],"employed":[156],"previous":[158],"studies.":[159]},"counts_by_year":[],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
