{"id":"https://openalex.org/W4385823101","doi":"https://doi.org/10.21437/interspeech.2023-293","title":"Integrating Emotion Recognition with Speech Recognition and Speaker Diarisation for Conversations","display_name":"Integrating Emotion Recognition with Speech Recognition and Speaker Diarisation for Conversations","publication_year":2023,"publication_date":"2023-08-14","ids":{"openalex":"https://openalex.org/W4385823101","doi":"https://doi.org/10.21437/interspeech.2023-293"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2023-293","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-293","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2308.07145","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079268196","display_name":"Wen Wu","orcid":"https://orcid.org/0000-0001-8116-7715"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Wen Wu","raw_affiliation_strings":["Department of Engineering, University of Cambridge, Cambridge, UK"],"affiliations":[{"raw_affiliation_string":"Department of Engineering, University of Cambridge, Cambridge, UK","institution_ids":["https://openalex.org/I241749"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100460206","display_name":"Chao Zhang","orcid":"https://orcid.org/0000-0002-7730-5131"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Zhang","raw_affiliation_strings":["Department of Electrical Engineering, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5002191410","display_name":"Philip C. Woodland","orcid":"https://orcid.org/0000-0001-9069-0225"},"institutions":[{"id":"https://openalex.org/I241749","display_name":"University of Cambridge","ror":"https://ror.org/013meh722","country_code":"GB","type":"education","lineage":["https://openalex.org/I241749"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Philip C. Woodland","raw_affiliation_strings":["Department of Engineering, University of Cambridge, Cambridge, UK"],"affiliations":[{"raw_affiliation_string":"Department of Engineering, University of Cambridge, Cambridge, UK","institution_ids":["https://openalex.org/I241749"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5079268196"],"corresponding_institution_ids":["https://openalex.org/I241749"],"apc_list":null,"apc_paid":null,"fwci":1.5646,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.83307435,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"3607","last_page":"3611"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9911999702453613,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9911999702453613,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9817000031471252,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.8086190223693848},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7987337112426758},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.6776484251022339},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.614549458026886},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5988484025001526},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.5858482718467712},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5252541899681091},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.5225857496261597},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4901893734931946},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4691696763038635},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.45838090777397156},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4454869031906128},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.44043800234794617},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.34318095445632935},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.07668259739875793},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.06944367289543152},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.06873717904090881}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.8086190223693848},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7987337112426758},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.6776484251022339},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.614549458026886},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5988484025001526},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.5858482718467712},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5252541899681091},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.5225857496261597},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4901893734931946},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4691696763038635},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.45838090777397156},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4454869031906128},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.44043800234794617},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.34318095445632935},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.07668259739875793},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.06944367289543152},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.06873717904090881},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2023-293","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-293","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2308.07145","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.07145","pdf_url":"https://arxiv.org/pdf/2308.07145","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2308.07145","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.07145","pdf_url":"https://arxiv.org/pdf/2308.07145","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.6399999856948853,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4385823101.pdf"},"referenced_works_count":29,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1502400294","https://openalex.org/W2009059481","https://openalex.org/W2125336414","https://openalex.org/W2131774270","https://openalex.org/W2146334809","https://openalex.org/W2726515241","https://openalex.org/W2890964092","https://openalex.org/W2963104701","https://openalex.org/W2973034847","https://openalex.org/W3016400019","https://openalex.org/W3036601975","https://openalex.org/W3048152940","https://openalex.org/W3093501662","https://openalex.org/W3096147072","https://openalex.org/W3096723250","https://openalex.org/W3096963953","https://openalex.org/W3167533889","https://openalex.org/W3195577433","https://openalex.org/W3197580070","https://openalex.org/W3198528147","https://openalex.org/W3209984917","https://openalex.org/W4221161909","https://openalex.org/W4226380987","https://openalex.org/W4286890477","https://openalex.org/W4288009632","https://openalex.org/W4297841503","https://openalex.org/W4319165154","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2162158162","https://openalex.org/W4247736853","https://openalex.org/W1493012537","https://openalex.org/W1999004162","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W1521049138","https://openalex.org/W2938358845","https://openalex.org/W2997340161"],"abstract_inverted_index":{"Although":[0],"automatic":[1,32,101],"emotion":[2,85,106],"recognition":[3,34],"(AER)":[4],"has":[5],"recently":[6],"drawn":[7],"significant":[8],"research":[9],"interest,":[10],"most":[11],"current":[12],"AER":[13,30,98],"studies":[14],"use":[15],"manually":[16],"segmented":[17],"utterances,":[18],"which":[19],"are":[20,47,94],"usually":[21],"unavailable":[22],"for":[23,49],"dialogue":[24],"systems.":[25],"This":[26],"paper":[27],"proposes":[28],"integrating":[29],"with":[31,100,125],"speech":[33,79],"(ASR)":[35],"and":[36,58,81,89,107,133],"speaker":[37,59,90,108],"diarisation":[38],"(SD)":[39],"in":[40],"a":[41,63,70],"jointly-trained":[42],"system.":[43],"Distinct":[44],"output":[45],"layers":[46],"built":[48],"four":[50],"sub-tasks":[51],"including":[52],"AER,":[53,131],"ASR,":[54],"voice":[55],"activity":[56],"detection":[57],"classification":[60,109],"based":[61,103],"on":[62,104,112,130],"shared":[64],"encoder.":[65],"Taking":[66],"the":[67,74,83,113,118],"audio":[68],"of":[69],"conversation":[71],"as":[72],"input,":[73],"integrated":[75],"system":[76,120],"finds":[77],"all":[78],"segments":[80],"transcribes":[82],"corresponding":[84],"classes,":[86],"word":[87],"sequences,":[88],"identities.":[91],"Two":[92],"metrics":[93],"proposed":[95,119],"to":[96],"evaluate":[97],"performance":[99],"segmentation":[102],"time-weighted":[105],"errors.":[110],"Results":[111],"IEMOCAP":[114],"dataset":[115],"show":[116],"that":[117],"consistently":[121],"outperforms":[122],"two":[123],"baselines":[124],"separately":[126],"trained":[127],"single-task":[128],"systems":[129],"ASR":[132],"SD.":[134]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2023-08-15T00:00:00"}
