{"id":"https://openalex.org/W7155558957","doi":"https://doi.org/10.48550/arxiv.2604.21507","title":"DiariZen Explained: A Tutorial for the Open Source State-of-the-Art Speaker Diarization Pipeline","display_name":"DiariZen Explained: A Tutorial for the Open Source State-of-the-Art Speaker Diarization Pipeline","publication_year":2026,"publication_date":"2026-04-23","ids":{"openalex":"https://openalex.org/W7155558957","doi":"https://doi.org/10.48550/arxiv.2604.21507"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.21507","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21507","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.21507","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5094230049","display_name":"Nikhil Raghav","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Raghav, Nikhil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5094230049"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8400999903678894,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8400999903678894,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.032099999487400055,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.013899999670684338,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7601000070571899},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.7182999849319458},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5376999974250793},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5353000164031982},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5274999737739563},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5245000123977661},{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.4812000095844269},{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.4471000134944916},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.43959999084472656}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.815500020980835},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7601000070571899},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.7182999849319458},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5376999974250793},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5353000164031982},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5274999737739563},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5245000123977661},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.4812000095844269},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45100000500679016},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.4471000134944916},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.43959999084472656},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4316999912261963},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4300999939441681},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3991999924182892},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3903999924659729},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.3804999887943268},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.34940001368522644},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.3163999915122986},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.27619999647140503},{"id":"https://openalex.org/C165464430","wikidata":"https://www.wikidata.org/wiki/Q1570441","display_name":"Parameterized complexity","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C2780154230","wikidata":"https://www.wikidata.org/wiki/Q513420","display_name":"Undo","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2567000091075897},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.25130000710487366},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.21507","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21507","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.21507","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21507","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speaker":[0],"diarization":[1,41],"(SD)":[2],"is":[3,205],"the":[4,66,71,74,85,105,119,178,191,199,223],"task":[5],"of":[6,22,70,76,118,190],"answering":[7],"\"who":[8],"spoke":[9],"when\"":[10],"in":[11,36],"a":[12,45,51,56,108,114,195,218],"multi-speaker":[13],"audio":[14,129],"stream.":[15],"Classically,":[16],"an":[17,26],"SD":[18,37,47],"system":[19,106],"clusters":[20],"segments":[21],"speech":[23],"belonging":[24],"to":[25,100],"individual":[27],"speaker's":[28],"identity.":[29],"Recent":[30],"years":[31],"have":[32],"seen":[33],"substantial":[34],"progress":[35],"through":[38],"end-to-end":[39],"neural":[40],"(EEND)":[42],"approaches.":[43],"DiariZen,":[44],"hybrid":[46],"pipeline":[48,225],"built":[49],"upon":[50],"structurally":[52],"pruned":[53],"WavLM-Large":[54],"encoder,":[55],"Conformer":[57,144],"backend":[58,145],"with":[59,139,158,164],"powerset":[60,147],"classification,":[61,148],"and":[62,91,98,131,146,167,170,187,217],"VBx":[63,162],"clustering,":[64],"represents":[65],"leading":[67],"open-source":[68],"state":[69],"art":[72],"at":[73,207],"time":[75],"writing":[77],"across":[78],"multiple":[79],"benchmarks.":[80],"Despite":[81],"its":[82],"strong":[83],"performance,":[84],"DiariZen":[86,121],"architecture":[87],"spans":[88],"several":[89],"repositories":[90],"frameworks,":[92],"making":[93],"it":[94,124],"difficult":[95],"for":[96,214],"researchers":[97],"practitioners":[99],"understand,":[101],"reproduce,":[102],"or":[103],"extend":[104],"as":[107],"whole.":[109],"This":[110],"tutorial":[111],"paper":[112],"provides":[113],"self-contained,":[115],"block-by-block":[116],"explanation":[117],"complete":[120,224],"pipeline,":[122],"decomposing":[123],"into":[125],"seven":[126],"stages:":[127],"(1)":[128],"loading":[130],"sliding":[132],"window":[133],"segmentation,":[134],"(2)":[135],"WavLM":[136],"feature":[137],"extraction":[138,157],"learned":[140],"layer":[141],"weighting,":[142],"(3)":[143],"(4)":[149],"segmentation":[150],"aggregation":[151],"via":[152],"overlap-add,":[153],"(5)":[154],"speaker":[155],"embedding":[156],"overlap":[159],"exclusion,":[160],"(6)":[161],"clustering":[163],"PLDA":[165],"scoring,":[166],"(7)":[168],"reconstruction":[169],"RTTM":[171],"output.":[172],"For":[173],"each":[174,215],"block,":[175],"we":[176],"provide":[177],"conceptual":[179],"motivation,":[180],"source":[181],"code":[182],"references,":[183],"intermediate":[184],"tensor":[185],"shapes,":[186],"annotated":[188],"visualizations":[189],"actual":[192],"outputs":[193],"on":[194],"30s":[196],"excerpt":[197],"from":[198],"AMI":[200],"Meeting":[201],"Corpus.":[202],"The":[203],"implementation":[204],"available":[206],"https://github.com/nikhilraghav29/diarizen-tutorial,":[208],"which":[209],"includes":[210],"standalone":[211],"executable":[212],"scripts":[213],"block":[216],"Jupyter":[219],"notebook":[220],"that":[221],"runs":[222],"end-to-end.":[226]},"counts_by_year":[],"updated_date":"2026-04-25T06:06:54.107920","created_date":"2026-04-25T00:00:00"}
