{"id":"https://openalex.org/W3207452728","doi":"https://doi.org/10.1109/asru51503.2021.9687974","title":"A Comparative Study of Modular and Joint Approaches for Speaker-Attributed ASR on Monaural Long-Form Audio","display_name":"A Comparative Study of Modular and Joint Approaches for Speaker-Attributed ASR on Monaural Long-Form Audio","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W3207452728","doi":"https://doi.org/10.1109/asru51503.2021.9687974","mag":"3207452728"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9687974","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687974","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016279564","display_name":"Naoyuki Kanda","orcid":"https://orcid.org/0000-0002-8628-3288"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Naoyuki Kanda","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057945477","display_name":"Xiong Xiao","orcid":"https://orcid.org/0000-0003-4471-7946"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiong Xiao","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029600413","display_name":"Jian Wu","orcid":"https://orcid.org/0000-0002-6325-0766"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jian Wu","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005521984","display_name":"Tianyan Zhou","orcid":"https://orcid.org/0000-0003-2583-8080"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tianyan Zhou","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034136587","display_name":"Yashesh Gaur","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yashesh Gaur","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaofei Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaofei Wang","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051392570","display_name":"Ziyang Meng","orcid":"https://orcid.org/0000-0002-3742-0039"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhong Meng","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhuo Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhuo Chen","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032260110","display_name":"Takuya Yoshioka","orcid":"https://orcid.org/0000-0001-7175-2435"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Takuya Yoshioka","raw_affiliation_strings":["Microsoft Corp.,USA"],"affiliations":[{"raw_affiliation_string":"Microsoft Corp.,USA","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5016279564"],"corresponding_institution_ids":["https://openalex.org/I1290206253"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.48275862,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"296","last_page":"303"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13497","display_name":"Hermeneutics and Narrative Identity","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1211","display_name":"Philosophy"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13497","display_name":"Hermeneutics and Narrative Identity","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1211","display_name":"Philosophy"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13695","display_name":"Aging, Elder Care, and Social Issues","score":0.9749000072479248,"subfield":{"id":"https://openalex.org/subfields/3600","display_name":"General Health Professions"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T13099","display_name":"Health, Medicine and Society","score":0.95660001039505,"subfield":{"id":"https://openalex.org/subfields/3600","display_name":"General Health Professions"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/monaural","display_name":"Monaural","score":0.9349876642227173},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8275570869445801},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.7857699990272522},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7220635414123535},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.6610637903213501},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6281329989433289},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5817716717720032},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5256025791168213},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.41845738887786865},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37320297956466675},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.37286195158958435},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08508837223052979}],"concepts":[{"id":"https://openalex.org/C102894143","wikidata":"https://www.wikidata.org/wiki/Q1323979","display_name":"Monaural","level":2,"score":0.9349876642227173},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8275570869445801},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.7857699990272522},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7220635414123535},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.6610637903213501},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6281329989433289},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5817716717720032},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5256025791168213},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.41845738887786865},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37320297956466675},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.37286195158958435},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08508837223052979},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9687974","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687974","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1591607137","https://openalex.org/W1876378865","https://openalex.org/W2046056978","https://openalex.org/W2125336414","https://openalex.org/W2148613904","https://openalex.org/W2398972335","https://openalex.org/W2407080277","https://openalex.org/W2696967604","https://openalex.org/W2729190387","https://openalex.org/W2752782242","https://openalex.org/W2796868621","https://openalex.org/W2808631503","https://openalex.org/W2884797218","https://openalex.org/W2939690918","https://openalex.org/W2952752702","https://openalex.org/W2963477857","https://openalex.org/W2963574857","https://openalex.org/W2963773971","https://openalex.org/W2963979492","https://openalex.org/W2972492143","https://openalex.org/W2973127116","https://openalex.org/W2997419692","https://openalex.org/W3000358149","https://openalex.org/W3007946293","https://openalex.org/W3008283340","https://openalex.org/W3008762051","https://openalex.org/W3015746570","https://openalex.org/W3015834770","https://openalex.org/W3016232124","https://openalex.org/W3016244460","https://openalex.org/W3017474798","https://openalex.org/W3020336359","https://openalex.org/W3025260599","https://openalex.org/W3094831814","https://openalex.org/W3095822285","https://openalex.org/W3097643313","https://openalex.org/W3097777922","https://openalex.org/W3109079702","https://openalex.org/W3143843080","https://openalex.org/W3154262773","https://openalex.org/W3160133086","https://openalex.org/W3162347631","https://openalex.org/W3162354890","https://openalex.org/W3162847598","https://openalex.org/W3163842642","https://openalex.org/W3181056718","https://openalex.org/W3196194966","https://openalex.org/W3212886388","https://openalex.org/W6631362777","https://openalex.org/W6740167877","https://openalex.org/W6744711567","https://openalex.org/W6761176859","https://openalex.org/W6775489429","https://openalex.org/W6787439801"],"related_works":["https://openalex.org/W11798771","https://openalex.org/W2308727","https://openalex.org/W13487902","https://openalex.org/W12391643","https://openalex.org/W11060696","https://openalex.org/W13110487","https://openalex.org/W8499301","https://openalex.org/W12256131","https://openalex.org/W1577664","https://openalex.org/W14868893"],"abstract_inverted_index":{"Speaker-attributed":[0],"automatic":[1],"speech":[2,26],"recognition":[3],"(SA-ASR)":[4],"is":[5],"a":[6,113,137],"task":[7],"to":[8,150,170],"recognize":[9],"\u201cwho":[10],"spoke":[11],"what\u201d":[12],"from":[13],"multi-talker":[14],"recordings.":[15,77],"An":[16],"SA-ASR":[17,43,73,81,120],"system":[18,143,154,158],"usually":[19],"consists":[20],"of":[21,66,98],"multiple":[22],"modules":[23],"such":[24,67,162],"as":[25],"separation,":[27],"speaker":[28,107,123],"diarization":[29],"and":[30,69,86,102],"ASR.":[31],"On":[32],"the":[33,37,64,103,118,128,141,151,156,172,176],"other":[34],"hand,":[35],"considering":[36],"joint":[38,70,87,142],"optimization,":[39],"an":[40],"end-to-end":[41],"(E2E)":[42],"model":[44,121],"has":[45],"recently":[46],"been":[47],"proposed":[48],"with":[49,136],"promising":[50],"results":[51],"on":[52,63,74,127],"simulation":[53],"data.":[54],"In":[55],"this":[56],"paper,":[57],"we":[58],"present":[59],"our":[60],"recent":[61],"study":[62],"comparison":[65],"modular":[68,85,153,157],"approaches":[71,88],"towards":[72],"real":[75,139],"monaural":[76,177],"We":[78,110,164],"develop":[79],"state-of-the-art":[80],"systems":[82],"for":[83,106,175],"both":[84],"by":[89],"leveraging":[90],"large-scale":[91],"training":[92,100],"data,":[93,140],"including":[94],"75":[95],"thousand":[96],"hours":[97],"ASR":[99],"data":[101],"VoxCeleb":[104],"corpus":[105,131],"representation":[108],"learning.":[109],"also":[111,165],"propose":[112],"new":[114],"pipeline":[115],"that":[116,133],"performs":[117,144,159],"E2E":[119],"after":[122,134],"clustering.":[124],"Our":[125],"evaluation":[126],"AMI":[129],"meeting":[130],"reveals":[132],"fine-tuning":[135],"small":[138],"8.9-29.9%":[145],"better":[146,160],"in":[147],"accuracy":[148],"compared":[149],"best":[152],"while":[155],"before":[161],"fine-tuning.":[163],"conduct":[166],"various":[167],"error":[168],"analyses":[169],"show":[171],"remaining":[173],"issues":[174],"SA-ASR.":[178]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2021-10-25T00:00:00"}
