{"id":"https://openalex.org/W2804758275","doi":"https://doi.org/10.21437/interspeech.2018-1364","title":"Multimodal Speaker Segmentation and Diarization Using Lexical and Acoustic Cues via Sequence to Sequence Neural Networks","display_name":"Multimodal Speaker Segmentation and Diarization Using Lexical and Acoustic Cues via Sequence to Sequence Neural Networks","publication_year":2018,"publication_date":"2018-08-28","ids":{"openalex":"https://openalex.org/W2804758275","doi":"https://doi.org/10.21437/interspeech.2018-1364","mag":"2804758275"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2018-1364","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2018-1364","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2018","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1805.10731","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101397880","display_name":"Tae Jin Park","orcid":null},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Tae Jin Park","raw_affiliation_strings":["University of Southern California, Los Angeles, United States"],"affiliations":[{"raw_affiliation_string":"University of Southern California, Los Angeles, United States","institution_ids":["https://openalex.org/I1174212"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5021678540","display_name":"Panayiotis Georgiou","orcid":"https://orcid.org/0000-0002-0790-7161"},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Panayiotis Georgiou","raw_affiliation_strings":["University of Southern California, Los Angeles, United States"],"affiliations":[{"raw_affiliation_string":"University of Southern California, Los Angeles, United States","institution_ids":["https://openalex.org/I1174212"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5101397880"],"corresponding_institution_ids":["https://openalex.org/I1174212"],"apc_list":null,"apc_paid":null,"fwci":0.5077,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.73271717,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1373","last_page":"1377"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9606999754905701,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.7861433029174805},{"id":"https://openalex.org/keywords/mel-frequency-cepstrum","display_name":"Mel-frequency cepstrum","score":0.7840845584869385},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7724959850311279},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7656406164169312},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5129947066307068},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5112466812133789},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5049052834510803},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5020840167999268},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.47788846492767334},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4661869704723358},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4480922818183899},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.44253110885620117},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.34132301807403564},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.32921522855758667},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.06775036454200745}],"concepts":[{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.7861433029174805},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.7840845584869385},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7724959850311279},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7656406164169312},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5129947066307068},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5112466812133789},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5049052834510803},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5020840167999268},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.47788846492767334},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4661869704723358},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4480922818183899},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.44253110885620117},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34132301807403564},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.32921522855758667},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.06775036454200745},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2018-1364","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2018-1364","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2018","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1805.10731","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1805.10731","pdf_url":"https://arxiv.org/pdf/1805.10731","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2804758275","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1805.10731.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1805.10731","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1805.10731","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1805.10731","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1805.10731","pdf_url":"https://arxiv.org/pdf/1805.10731","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6299999952316284}],"awards":[{"id":"https://openalex.org/G4066213532","display_name":null,"funder_award_id":"W81XWH-15-1-0632","funder_id":"https://openalex.org/F4320306078","funder_display_name":"U.S. Department of Defense"},{"id":"https://openalex.org/G4119534169","display_name":null,"funder_award_id":"W81XWH","funder_id":"https://openalex.org/F4320306078","funder_display_name":"U.S. Department of Defense"},{"id":"https://openalex.org/G4294856372","display_name":null,"funder_award_id":"W81XWH-15-1","funder_id":"https://openalex.org/F4320306078","funder_display_name":"U.S. Department of Defense"},{"id":"https://openalex.org/G7139631174","display_name":null,"funder_award_id":"21702-5014","funder_id":"https://openalex.org/F4320306078","funder_display_name":"U.S. Department of Defense"}],"funders":[{"id":"https://openalex.org/F4320306078","display_name":"U.S. Department of Defense","ror":"https://ror.org/0447fe631"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2804758275.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2963227667","https://openalex.org/W2890825077","https://openalex.org/W135211875","https://openalex.org/W2130708324","https://openalex.org/W1975884129","https://openalex.org/W2938358845","https://openalex.org/W1658556320","https://openalex.org/W3033627755","https://openalex.org/W3007946293","https://openalex.org/W1485877426","https://openalex.org/W2774196182","https://openalex.org/W7559903","https://openalex.org/W2057668940","https://openalex.org/W2807391387","https://openalex.org/W2673722796","https://openalex.org/W2043697776","https://openalex.org/W3204618061","https://openalex.org/W2968071640","https://openalex.org/W3081170851","https://openalex.org/W2743902494"],"abstract_inverted_index":{"While":[0,162],"there":[1,12],"has":[2],"been":[3],"substantial":[4],"amount":[5],"of":[6,152,174],"work":[7],"in":[8,16],"speaker":[9,24,31,59,66,74,128],"diarization":[10,32,129],"recently,":[11],"are":[13,91],"few":[14],"efforts":[15],"jointly":[17],"employing":[18],"lexical":[19,42,88,109,139],"and":[20,43,123],"acoustic":[21,44,85],"information":[22,89,106],"for":[23,54,72],"segmentation.":[25],"Towards":[26],"that,":[27],"we":[28,100,148],"investigate":[29],"a":[30,35,49],"system":[33,117,134],"using":[34],"sequence-to-sequence":[36,116],"neural":[37],"network":[38],"trained":[39,118],"on":[40,119,127,138,165,184],"both":[41,120],"features.":[45],"We":[46,76],"also":[47,63],"propose":[48],"loss":[50],"function":[51],"that":[52,90,102,115,135],"allows":[53],"selecting":[55],"not":[56],"only":[57,136],"the":[58,64,96,108,133,142,150,163,169,180],"change":[60],"points":[61],"but":[62],"best":[65],"at":[67],"any":[68],"time":[69],"by":[70],"allowing":[71],"different":[73],"groupings.":[75],"incorporate":[77],"Mel":[78],"Frequency":[79],"Cepstral":[80],"Coefficients":[81],"(MFCC)":[82],"as":[83],"an":[84],"feature":[86],"alongside":[87],"obtained":[92],"from":[93,95],"conversations":[94],"Fisher":[97],"dataset.":[98],"Thus,":[99],"show":[101,114],"acoustics":[103],"provide":[104],"complementary":[105],"to":[107,132],"modality.":[110],"The":[111],"experimental":[112],"results":[113],"word":[121],"sequences":[122],"MFCC":[124],"can":[125],"improve":[126],"result":[130],"compared":[131],"relies":[137],"modality":[140],"or":[141],"baseline":[143],"MFCC-based":[144],"system.":[145],"In":[146],"addition,":[147],"test":[149],"performance":[151,164],"our":[153,175],"proposed":[154,176],"method":[155,177,182],"with":[156],"Automatic":[157],"Speech":[158],"Recognition":[159],"(ASR)":[160],"transcripts.":[161],"ASR":[166],"transcripts":[167],"drops,":[168],"Diarization":[170],"Error":[171],"Rate":[172],"(DER)":[173],"still":[178],"outperforms":[179],"traditional":[181],"based":[183],"Bayesian":[185],"Information":[186],"Criterion":[187],"(BIC).":[188]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
