{"id":"https://openalex.org/W4387968393","doi":"https://doi.org/10.1145/3581783.3612424","title":"Uncertainty-Guided End-to-End Audio-Visual Speaker Diarization for Far-Field Recordings","display_name":"Uncertainty-Guided End-to-End Audio-Visual Speaker Diarization for Far-Field Recordings","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387968393","doi":"https://doi.org/10.1145/3581783.3612424"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612424","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612424","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103234628","display_name":"Chenyu Yang","orcid":"https://orcid.org/0009-0007-1697-9216"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chenyu Yang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019915501","display_name":"Mengxi Chen","orcid":"https://orcid.org/0009-0006-6790-6490"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengxi Chen","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100645705","display_name":"Yanfeng Wang","orcid":"https://orcid.org/0000-0002-3196-2347"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanfeng Wang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100445125","display_name":"Yu Wang","orcid":"https://orcid.org/0000-0001-9500-081X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Wang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai AI Laboratory, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai AI Laboratory, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5103234628"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.3924,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.5789376,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"4031","last_page":"4041"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.8353649377822876},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8130674362182617},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6891082525253296},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5781704783439636},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.521973729133606},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.48529261350631714},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.47617048025131226},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.43182095885276794},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.4285500645637512},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3664548993110657},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.1921423375606537},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.11032238602638245},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07713598012924194}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.8353649377822876},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8130674362182617},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6891082525253296},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5781704783439636},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.521973729133606},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.48529261350631714},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.47617048025131226},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.43182095885276794},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.4285500645637512},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3664548993110657},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.1921423375606537},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.11032238602638245},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07713598012924194},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3612424","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612424","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G407523942","display_name":null,"funder_award_id":"62106140","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4532141107","display_name":null,"funder_award_id":"21511101100","funder_id":"https://openalex.org/F4320321885","funder_display_name":"Science and Technology Commission of Shanghai Municipality"},{"id":"https://openalex.org/G5939423041","display_name":null,"funder_award_id":"Technology","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6833792719","display_name":null,"funder_award_id":"11101100","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321885","display_name":"Science and Technology Commission of Shanghai Municipality","ror":"https://ror.org/03kt66j61"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1013543043","https://openalex.org/W1965819578","https://openalex.org/W1971901154","https://openalex.org/W2002979515","https://openalex.org/W2081074144","https://openalex.org/W2127211243","https://openalex.org/W2148613904","https://openalex.org/W2150769028","https://openalex.org/W2316138215","https://openalex.org/W2460742184","https://openalex.org/W2604379605","https://openalex.org/W2606429533","https://openalex.org/W2734774145","https://openalex.org/W2759799350","https://openalex.org/W2767014232","https://openalex.org/W2889381673","https://openalex.org/W2969985801","https://openalex.org/W2972729214","https://openalex.org/W2991553872","https://openalex.org/W2991853946","https://openalex.org/W3008762051","https://openalex.org/W3009086519","https://openalex.org/W3015834770","https://openalex.org/W3015841457","https://openalex.org/W3016011581","https://openalex.org/W3016098309","https://openalex.org/W3025260599","https://openalex.org/W3034552680","https://openalex.org/W3035376925","https://openalex.org/W3038871978","https://openalex.org/W3041847644","https://openalex.org/W3095212884","https://openalex.org/W3119269912","https://openalex.org/W3128434617","https://openalex.org/W3140898556","https://openalex.org/W3162770427","https://openalex.org/W3169351047","https://openalex.org/W3178462146","https://openalex.org/W3203700770","https://openalex.org/W3206008172","https://openalex.org/W3212886388","https://openalex.org/W4225661121","https://openalex.org/W4286378963","https://openalex.org/W4296069335","https://openalex.org/W4297841834","https://openalex.org/W4301409532","https://openalex.org/W4304084053","https://openalex.org/W4312946813"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1493012537","https://openalex.org/W1999004162","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2938358845","https://openalex.org/W2997340161"],"abstract_inverted_index":{"Audio-visual":[0],"speaker":[1,61,83],"diarization":[2,62,98],"refers":[3],"to":[4,77,118,165,171],"the":[5,42,89,92,116,120,130,140],"task":[6],"of":[7,44,91,132,152],"identifying":[8],"\"who":[9],"spoke":[10],"when\"":[11],"by":[12],"using":[13],"both":[14,71,175],"audio":[15,113,154],"and":[16,37,74,81,155,177],"video":[17,156],"data.":[18,157],"Although":[19],"previous":[20],"fusion-based":[21],"approaches":[22],"have":[23,31,38],"shown":[24],"exceptional":[25],"performance":[26,99,168],"over":[27],"audio-only":[28],"methods,":[29],"they":[30],"mainly":[32],"focused":[33],"on":[34,139],"high-quality":[35],"data":[36],"not":[39],"accounted":[40],"for":[41,174],"impacts":[43],"acoustic":[45],"noise":[46],"or":[47,103],"missing":[48],"faces.":[49],"To":[50,128],"address":[51],"these":[52],"limitations,":[53],"we":[54,135],"propose":[55],"a":[56,124],"novel":[57],"uncertainty-aware":[58],"end-to-end":[59],"audio-visual":[60],"(UAV-SD)":[63],"approach":[64,69,108],"in":[65,101,183],"this":[66],"paper.":[67],"Our":[68],"leverages":[70],"framewise":[72],"inter-":[73],"intra-modal":[75],"confidence":[76],"achieve":[78,96],"more":[79,125],"effective":[80],"robust":[82],"diarization.":[84],"By":[85],"taking":[86],"into":[87],"account":[88],"uncertainty":[90],"data,":[93,179],"UAV-SD":[94,162],"can":[95],"better":[97],"even":[100],"noisy":[102],"low-quality":[104],"recordings.":[105],"Additionally,":[106],"our":[107,133],"is":[109,163],"compatible":[110],"with":[111],"multi-channel":[112,178],"signals":[114],"without":[115],"need":[117],"retrain":[119],"model,":[121],"making":[122],"it":[123],"versatile":[126],"solution.":[127],"evaluate":[129],"effectiveness":[131,182],"approach,":[134],"conduct":[136],"extensive":[137],"experiments":[138],"Multi-modal":[141],"Information":[142],"Based":[143],"Speech":[144],"Processing":[145],"(MISP)":[146],"2022":[147],"Challenge":[148],"datasets":[149],"which":[150],"consist":[151],"far-field":[153],"The":[158],"results":[159],"show":[160],"that":[161],"able":[164],"yield":[166],"significant":[167],"gains":[169],"compared":[170],"baseline":[172],"methods":[173],"single":[176],"demonstrating":[180],"its":[181],"real-world":[184],"scenarios.":[185]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
