{"id":"https://openalex.org/W4399447990","doi":"https://doi.org/10.21437/interspeech.2024-917","title":"ASoBO: Attentive Beamformer Selection for Distant Speaker Diarization in Meetings","display_name":"ASoBO: Attentive Beamformer Selection for Distant Speaker Diarization in Meetings","publication_year":2024,"publication_date":"2024-09-01","ids":{"openalex":"https://openalex.org/W4399447990","doi":"https://doi.org/10.21437/interspeech.2024-917"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2024-917","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/interspeech.2024-917","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2024","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.03251","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036816609","display_name":"Th\u00e9o Mariotte","orcid":"https://orcid.org/0000-0002-2108-101X"},"institutions":[{"id":"https://openalex.org/I4210108471","display_name":"Le Mans Universit\u00e9","ror":"https://ror.org/01mtcc283","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210108471"]},{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1294671590"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Th\u00e9o Mariotte","raw_affiliation_strings":["LIUM - Laboratoire d'Informatique de l'Universit\u00e9 du Mans (Avenue Laennec 72085 Le Mans cedex 9 - France)","LAUM - Laboratoire d'Acoustique de l'Universit\u00e9 du Mans (Laboratoire d'Acoustique de l'Universit\u00e9 du Mans, LAUM - UMR 6613 CNRS, Le Mans Universit\u00e9, \r\nAvenue Olivier Messiaen, 72085 LE MANS - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LIUM - Laboratoire d'Informatique de l'Universit\u00e9 du Mans (Avenue Laennec 72085 Le Mans cedex 9 - France)","institution_ids":["https://openalex.org/I4210108471"]},{"raw_affiliation_string":"LAUM - Laboratoire d'Acoustique de l'Universit\u00e9 du Mans (Laboratoire d'Acoustique de l'Universit\u00e9 du Mans, LAUM - UMR 6613 CNRS, Le Mans Universit\u00e9, \r\nAvenue Olivier Messiaen, 72085 LE MANS - France)","institution_ids":["https://openalex.org/I1294671590","https://openalex.org/I4210108471"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002979461","display_name":"Anthony Larcher","orcid":"https://orcid.org/0000-0003-4398-0224"},"institutions":[{"id":"https://openalex.org/I4210108471","display_name":"Le Mans Universit\u00e9","ror":"https://ror.org/01mtcc283","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210108471"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Anthony Larcher","raw_affiliation_strings":["LIUM - Laboratoire d'Informatique de l'Universit\u00e9 du Mans (Avenue Laennec 72085 Le Mans cedex 9 - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LIUM - Laboratoire d'Informatique de l'Universit\u00e9 du Mans (Avenue Laennec 72085 Le Mans cedex 9 - France)","institution_ids":["https://openalex.org/I4210108471"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032144114","display_name":"Silvio Montr\u00e9sor","orcid":null},"institutions":[{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1294671590"]},{"id":"https://openalex.org/I4210108471","display_name":"Le Mans Universit\u00e9","ror":"https://ror.org/01mtcc283","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210108471"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Silvio Montr\u00e9sor","raw_affiliation_strings":["LAUM - Laboratoire d'Acoustique de l'Universit\u00e9 du Mans (Laboratoire d'Acoustique de l'Universit\u00e9 du Mans, LAUM - UMR 6613 CNRS, Le Mans Universit\u00e9, \r\nAvenue Olivier Messiaen, 72085 LE MANS - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LAUM - Laboratoire d'Acoustique de l'Universit\u00e9 du Mans (Laboratoire d'Acoustique de l'Universit\u00e9 du Mans, LAUM - UMR 6613 CNRS, Le Mans Universit\u00e9, \r\nAvenue Olivier Messiaen, 72085 LE MANS - France)","institution_ids":["https://openalex.org/I1294671590","https://openalex.org/I4210108471"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103246472","display_name":"Jean\u2010Hugh Thomas","orcid":"https://orcid.org/0000-0003-3512-214X"},"institutions":[{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"funder","lineage":["https://openalex.org/I1294671590"]},{"id":"https://openalex.org/I4210108471","display_name":"Le Mans Universit\u00e9","ror":"https://ror.org/01mtcc283","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210108471"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Jean-Hugh Thomas","raw_affiliation_strings":["LAUM - Laboratoire d'Acoustique de l'Universit\u00e9 du Mans (Laboratoire d'Acoustique de l'Universit\u00e9 du Mans, LAUM - UMR 6613 CNRS, Le Mans Universit\u00e9, \r\nAvenue Olivier Messiaen, 72085 LE MANS - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LAUM - Laboratoire d'Acoustique de l'Universit\u00e9 du Mans (Laboratoire d'Acoustique de l'Universit\u00e9 du Mans, LAUM - UMR 6613 CNRS, Le Mans Universit\u00e9, \r\nAvenue Olivier Messiaen, 72085 LE MANS - France)","institution_ids":["https://openalex.org/I1294671590","https://openalex.org/I4210108471"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5036816609"],"corresponding_institution_ids":["https://openalex.org/I1294671590","https://openalex.org/I4210108471"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.07923468,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1620","last_page":"1624"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.8591964244842529},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7651087045669556},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7509317994117737},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.6469402313232422},{"id":"https://openalex.org/keywords/beamforming","display_name":"Beamforming","score":0.6228142380714417},{"id":"https://openalex.org/keywords/microphone-array","display_name":"Microphone array","score":0.5910733342170715},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.5863116383552551},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.552555501461029},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.47525903582572937},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4705560505390167},{"id":"https://openalex.org/keywords/filter-bank","display_name":"Filter bank","score":0.45284923911094666},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.44359853863716125},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.43218016624450684},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4237307012081146},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.29473525285720825},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.12755301594734192},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.10222470760345459}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.8591964244842529},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7651087045669556},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7509317994117737},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.6469402313232422},{"id":"https://openalex.org/C54197355","wikidata":"https://www.wikidata.org/wiki/Q5782992","display_name":"Beamforming","level":2,"score":0.6228142380714417},{"id":"https://openalex.org/C2778806681","wikidata":"https://www.wikidata.org/wiki/Q907293","display_name":"Microphone array","level":4,"score":0.5910733342170715},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.5863116383552551},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.552555501461029},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.47525903582572937},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4705560505390167},{"id":"https://openalex.org/C100515483","wikidata":"https://www.wikidata.org/wiki/Q3268235","display_name":"Filter bank","level":3,"score":0.45284923911094666},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.44359853863716125},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.43218016624450684},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4237307012081146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.29473525285720825},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.12755301594734192},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.10222470760345459},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C68115822","wikidata":"https://www.wikidata.org/wiki/Q1068172","display_name":"Sound pressure","level":2,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2024-917","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/interspeech.2024-917","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2024","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2406.03251","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.03251","pdf_url":"https://arxiv.org/pdf/2406.03251","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"pmh:oai:HAL:hal-04602289v1","is_oa":true,"landing_page_url":"https://univ-lemans.hal.science/hal-04602289","pdf_url":"https://univ-lemans.hal.science/hal-04602289/document","source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Interspeech, International Speech Communication Association (ISCA), Sep 2024, Kos / Greece, Greece","raw_type":"Conference papers"},{"id":"doi:10.48550/arxiv.2406.03251","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2406.03251","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2406.03251","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.03251","pdf_url":"https://arxiv.org/pdf/2406.03251","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.5199999809265137}],"awards":[{"id":"https://openalex.org/G1454328438","display_name":null,"funder_award_id":"101007666","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320326256","display_name":"Grand \u00c9quipement National De Calcul Intensif","ror":"https://ror.org/0010d1q40"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399447990.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2162158162","https://openalex.org/W4247736853","https://openalex.org/W1493012537","https://openalex.org/W1999004162","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2186375278","https://openalex.org/W2938358845"],"abstract_inverted_index":{"Speaker":[0],"Diarization":[1],"(SD)":[2],"aims":[3],"at":[4],"grouping":[5],"speech":[6],"segments":[7],"that":[8],"belong":[9],"to":[10,46,62,72],"the":[11,35,59,64,74,107,124,130,140],"same":[12],"speaker.":[13],"This":[14,66,83],"task":[15],"is":[16,42,103],"required":[17],"in":[18],"many":[19],"speech-processing":[20],"applications,":[21],"such":[22],"as":[23,86,136],"rich":[24],"meeting":[25],"transcription.":[26],"In":[27],"this":[28],"context,":[29],"distant":[30,114],"microphone":[31],"arrays":[32],"usually":[33],"capture":[34],"audio":[36,49],"signal.":[37],"Beamforming,":[38],"i.e.,":[39],"spatial":[40,81],"filtering,":[41],"a":[43,69,77,87],"common":[44],"practice":[45],"process":[47],"multi-microphone":[48],"data.":[50],"However,":[51],"it":[52],"often":[53],"requires":[54],"an":[55],"explicit":[56],"localization":[57],"of":[58,76,79,129],"active":[60],"source":[61],"steer":[63],"filter.":[65],"paper":[67],"proposes":[68],"self-attention-based":[70],"algorithm":[71],"select":[73],"output":[75],"bank":[78],"fixed":[80],"filters.":[82],"method":[84],"serves":[85],"feature":[88],"extractor":[89],"for":[90],"joint":[91],"Voice":[92],"Activity":[93],"(VAD)":[94],"and":[95,117],"Overlapped":[96],"Speech":[97],"Detection":[98],"(OSD).":[99],"The":[100,110,127],"speaker":[101],"diarization":[102],"then":[104],"inferred":[105],"from":[106],"detected":[108],"segments.":[109],"approach":[111],"shows":[112],"convincing":[113],"VAD,":[115],"OSD,":[116],"SD":[118],"performance,":[119],"e.g.":[120],"14.5%":[121],"DER":[122],"on":[123],"AISHELL-4":[125],"dataset.":[126],"analysis":[128],"self-attention":[131],"weights":[132],"demonstrates":[133],"their":[134],"explainability,":[135],"they":[137],"correlate":[138],"with":[139],"speaker's":[141],"angular":[142],"locations.":[143]},"counts_by_year":[],"updated_date":"2026-06-07T08:38:57.713557","created_date":"2024-06-08T00:00:00"}
