{"id":"https://openalex.org/W4392902855","doi":"https://doi.org/10.1109/icassp48485.2024.10446204","title":"Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach","display_name":"Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392902855","doi":"https://doi.org/10.1109/icassp48485.2024.10446204"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446204","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446204","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101397880","display_name":"Tae Jin Park","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Tae Jin Park","raw_affiliation_strings":["NVIDIA,Santa Clara,USA","NVIDIA, Santa Clara, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073181369","display_name":"Kunal Dhawan","orcid":"https://orcid.org/0000-0002-5276-2475"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kunal Dhawan","raw_affiliation_strings":["NVIDIA,Santa Clara,USA","NVIDIA, Santa Clara, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040007143","display_name":"Nithin Rao Koluguri","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nithin Koluguri","raw_affiliation_strings":["NVIDIA,Santa Clara,USA","NVIDIA, Santa Clara, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040747392","display_name":"Jagadeesh Balam","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jagadeesh Balam","raw_affiliation_strings":["NVIDIA,Santa Clara,USA","NVIDIA, Santa Clara, USA"],"affiliations":[{"raw_affiliation_string":"NVIDIA,Santa Clara,USA","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101397880"],"corresponding_institution_ids":["https://openalex.org/I4210127875"],"apc_list":null,"apc_paid":null,"fwci":2.9009,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.91459724,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"10861","last_page":"10865"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.875031590461731},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7774475812911987},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6097027659416199},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5474106669425964},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5431011319160461},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5320974588394165},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.5179868936538696},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4682990610599518},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.45869913697242737},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.429311603307724},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38705891370773315},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.3665410280227661},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08531194925308228}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.875031590461731},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7774475812911987},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6097027659416199},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5474106669425964},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5431011319160461},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5320974588394165},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.5179868936538696},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4682990610599518},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.45869913697242737},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.429311603307724},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38705891370773315},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.3665410280227661},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08531194925308228},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446204","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446204","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5099999904632568,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2130266733","https://openalex.org/W2144499799","https://openalex.org/W2167241364","https://openalex.org/W2405472765","https://openalex.org/W2902864383","https://openalex.org/W2906383034","https://openalex.org/W2949676527","https://openalex.org/W2963227667","https://openalex.org/W2973127116","https://openalex.org/W2973727699","https://openalex.org/W2974231335","https://openalex.org/W2997419692","https://openalex.org/W3020336359","https://openalex.org/W3025260599","https://openalex.org/W3097777922","https://openalex.org/W3105031100","https://openalex.org/W3174269049","https://openalex.org/W3178462146","https://openalex.org/W3203417382","https://openalex.org/W3204618061","https://openalex.org/W3205878676","https://openalex.org/W3212886388","https://openalex.org/W4224329333","https://openalex.org/W4225890157","https://openalex.org/W4385245566","https://openalex.org/W4385570646","https://openalex.org/W4385822373","https://openalex.org/W4385823385","https://openalex.org/W4389315128","https://openalex.org/W4389315129","https://openalex.org/W6679855610","https://openalex.org/W6767671539","https://openalex.org/W6767997687"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2149220986","https://openalex.org/W1493012537","https://openalex.org/W4247736853","https://openalex.org/W2162158162","https://openalex.org/W1999004162","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2023466863","https://openalex.org/W2696990509"],"abstract_inverted_index":{"Large":[0],"language":[1,14],"models":[2,117],"(LLMs)":[3],"have":[4],"shown":[5],"great":[6],"promise":[7],"for":[8,118],"capturing":[9,186],"contextual":[10,33,152,189],"information":[11,50,114,153],"in":[12,35,54],"natural":[13],"processing":[15],"tasks.":[16],"We":[17,58],"propose":[18],"a":[19],"novel":[20],"approach":[21,129],"to":[22,31,73,115,132,150,157,171,177],"speaker":[23,44,120,164,179],"diarization":[24,45,95,121,180],"that":[25,85,109,144,154],"incorporates":[26],"the":[27,55,60,90,98,119,124,138,145,172],"prowess":[28],"of":[29,174],"LLMs":[30,110,176],"exploit":[32,151],"cues":[34,75],"human":[36],"dialogues.":[37],"Our":[38,82],"method":[39],"builds":[40],"upon":[41],"an":[42,52,93],"acoustic-based":[43],"system":[46,96],"by":[47,163,185],"adding":[48],"lexical":[49,70,87],"from":[51,76,89,137],"LLM":[53,91],"inference":[56],"stage.":[57],"model":[59],"multi-modal":[61],"decoding":[62,128],"process":[63],"probabilistically":[64],"and":[65,69,80,181,188],"perform":[66],"joint":[67],"acoustic":[68,116],"beam":[71,126],"searches":[72],"incorporate":[74],"both":[77],"modalities:":[78],"audio":[79],"text.":[81],"experiments":[83],"demonstrate":[84],"infusing":[86],"knowledge":[88],"into":[92],"acoustics-only":[94,158],"improves":[97],"overall":[99],"speaker-attributed":[100],"word":[101],"error":[102],"rate":[103],"(SA-WER).":[104],"The":[105],"experimental":[106],"results":[107],"show":[108],"can":[111],"provide":[112],"complementary":[113],"task":[122],"via":[123],"proposed":[125,146],"search":[127],"showing":[130],"up":[131],"39.8%":[133],"relative":[134],"delta-SA-WER":[135],"improvement":[136],"baseline":[139],"system.":[140],"Thus,":[141],"we":[142],"substantiate":[143],"technique":[147],"is":[148,155,161],"able":[149],"inaccessible":[156],"systems":[159],"which":[160],"represented":[162],"embeddings.":[165],"In":[166],"addition,":[167],"these":[168],"findings":[169],"point":[170],"potential":[173],"using":[175],"improve":[178],"other":[182],"speech-processing":[183],"tasks":[184],"semantic":[187],"cues.":[190]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
