{"id":"https://openalex.org/W4408352935","doi":"https://doi.org/10.1109/icassp49660.2025.10888172","title":"Multimodal Emotion Recognition in Conversation via Possible Speaker\u2019s Audio and Visual Sequence Selection","display_name":"Multimodal Emotion Recognition in Conversation via Possible Speaker\u2019s Audio and Visual Sequence Selection","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352935","doi":"https://doi.org/10.1109/icassp49660.2025.10888172"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888172","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888172","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083895545","display_name":"Rahul Singh Maharjan","orcid":"https://orcid.org/0000-0002-3893-2079"},"institutions":[{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Rahul Singh Maharjan","raw_affiliation_strings":["University of Manchester,Manchester Centre for Robotics and AI,Manchester,UK"],"affiliations":[{"raw_affiliation_string":"University of Manchester,Manchester Centre for Robotics and AI,Manchester,UK","institution_ids":["https://openalex.org/I28407311"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086004172","display_name":"Niyati Rawal","orcid":"https://orcid.org/0000-0002-4142-0488"},"institutions":[{"id":"https://openalex.org/I122346577","display_name":"University of Modena and Reggio Emilia","ror":"https://ror.org/02d4c4y02","country_code":"IT","type":"education","lineage":["https://openalex.org/I122346577"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Niyati Rawal","raw_affiliation_strings":["University of Modena and Reggio Emilia,Modena,Italy"],"affiliations":[{"raw_affiliation_string":"University of Modena and Reggio Emilia,Modena,Italy","institution_ids":["https://openalex.org/I122346577"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056837072","display_name":"Marta Romeo","orcid":"https://orcid.org/0000-0003-4438-0255"},"institutions":[{"id":"https://openalex.org/I32062511","display_name":"Heriot-Watt University","ror":"https://ror.org/04mghma93","country_code":"GB","type":"education","lineage":["https://openalex.org/I32062511"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Marta Romeo","raw_affiliation_strings":["Heriot-Watt University,Department of Computer Science,Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"Heriot-Watt University,Department of Computer Science,Edinburgh,UK","institution_ids":["https://openalex.org/I32062511"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048928616","display_name":"Lorenzo Baraldi","orcid":"https://orcid.org/0000-0001-5125-4957"},"institutions":[{"id":"https://openalex.org/I122346577","display_name":"University of Modena and Reggio Emilia","ror":"https://ror.org/02d4c4y02","country_code":"IT","type":"education","lineage":["https://openalex.org/I122346577"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Lorenzo Baraldi","raw_affiliation_strings":["University of Modena and Reggio Emilia,Modena,Italy"],"affiliations":[{"raw_affiliation_string":"University of Modena and Reggio Emilia,Modena,Italy","institution_ids":["https://openalex.org/I122346577"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030948871","display_name":"Rita Cucchiara","orcid":"https://orcid.org/0000-0002-2239-283X"},"institutions":[{"id":"https://openalex.org/I122346577","display_name":"University of Modena and Reggio Emilia","ror":"https://ror.org/02d4c4y02","country_code":"IT","type":"education","lineage":["https://openalex.org/I122346577"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Rita Cucchiara","raw_affiliation_strings":["University of Modena and Reggio Emilia,Modena,Italy"],"affiliations":[{"raw_affiliation_string":"University of Modena and Reggio Emilia,Modena,Italy","institution_ids":["https://openalex.org/I122346577"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091768977","display_name":"Angelo Cangelosi","orcid":"https://orcid.org/0000-0002-4709-2243"},"institutions":[{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Angelo Cangelosi","raw_affiliation_strings":["University of Manchester,Manchester Centre for Robotics and AI,Manchester,UK"],"affiliations":[{"raw_affiliation_string":"University of Manchester,Manchester Centre for Robotics and AI,Manchester,UK","institution_ids":["https://openalex.org/I28407311"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5083895545"],"corresponding_institution_ids":["https://openalex.org/I28407311"],"apc_list":null,"apc_paid":null,"fwci":8.2536,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.96964414,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.8382999897003174,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.8382999897003174,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.7368999719619751,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.8179895877838135},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7768964767456055},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7642483711242676},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6462064385414124},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.6155298352241516},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5312761068344116},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.4949457049369812},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.47429993748664856},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43380531668663025},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3971211910247803},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.24587830901145935},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.22517845034599304},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.1373247504234314}],"concepts":[{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.8179895877838135},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7768964767456055},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7642483711242676},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6462064385414124},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.6155298352241516},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5312761068344116},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.4949457049369812},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.47429993748664856},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43380531668663025},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3971211910247803},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.24587830901145935},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.22517845034599304},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.1373247504234314},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888172","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888172","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:iris.unimore.it:11380/1365888","is_oa":false,"landing_page_url":"https://hdl.handle.net/11380/1365888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400718","display_name":"IRIS UNIMORE (University of Modena and Reggio Emilia)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I122346577","host_organization_name":"University of Modena and Reggio Emilia","host_organization_lineage":["https://openalex.org/I122346577"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/conferenceObject"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W2146334809","https://openalex.org/W2164598857","https://openalex.org/W2740550900","https://openalex.org/W2951583236","https://openalex.org/W2963686995","https://openalex.org/W2964300796","https://openalex.org/W2965453734","https://openalex.org/W2981873476","https://openalex.org/W2982108874","https://openalex.org/W3034999214","https://openalex.org/W3117369308","https://openalex.org/W4221147459","https://openalex.org/W4221154966","https://openalex.org/W4385245566","https://openalex.org/W4385570058","https://openalex.org/W4385573848","https://openalex.org/W4386076638","https://openalex.org/W4407926395","https://openalex.org/W6776148200","https://openalex.org/W6788335241","https://openalex.org/W6798523648","https://openalex.org/W6847363464","https://openalex.org/W6903887316"],"related_works":["https://openalex.org/W2899084033","https://openalex.org/W1968552888","https://openalex.org/W2374116601","https://openalex.org/W3093134843","https://openalex.org/W1511346092","https://openalex.org/W1527532029","https://openalex.org/W3006475563","https://openalex.org/W3015707499","https://openalex.org/W4287868249","https://openalex.org/W2326728821"],"abstract_inverted_index":{"Multimodal":[0,141],"Emotion":[1,142],"Recognition":[2,143],"in":[3,10,29,68,118,144,164],"Conversation":[4,145],"(MERC)":[5],"is":[6,106,149],"an":[7],"important":[8],"element":[9],"human-machine":[11],"interaction.":[12],"It":[13],"allows":[14],"machines":[15],"to":[16,61,152],"automatically":[17],"identify":[18],"and":[19,39,56,77,90,156,178],"track":[20],"the":[21,34,54,63,84,107,112,137,165,174,183],"emotional":[22],"status":[23],"of":[24,65,114,160,167,185],"speakers":[25],"during":[26,128],"a":[27,30,96,161,199],"conversation":[28,97],"multimodal":[31],"setting.":[32],"However,":[33,95],"conversations":[35],"involving":[36],"various":[37],"audio":[38,55,88,154],"visual":[40,57,78,157],"cues":[41,45],"aligned":[42],"with":[43,59,192,198],"textual":[44,60,86],"are":[46,93],"very":[47],"complex.":[48],"Recent":[49],"works":[50],"have":[51],"tried":[52],"integrating":[53],"modalities":[58,124],"improve":[62],"performance":[64,191],"emotion":[66],"recognition":[67],"conversation.":[69],"Although":[70],"many":[71,119],"MERC":[72,110],"models":[73,81,195],"leverage":[74],"textual,":[75],"audio,":[76],"modalities,":[79,116],"those":[80],"assume":[82],"that":[83],"speaker\u2019s":[85],"utterance,":[87],"speech,":[89],"facial":[91],"sequences":[92,159],"present.":[94],"may":[98,125],"contain":[99],"multiple":[100,168],"parties,":[101],"among":[102],"which":[103],"only":[104],"one":[105,121],"speaker.":[108],"Previous":[109],"assumed":[111],"availability":[113],"all":[115],"but":[117],"instances,":[120],"or":[122],"more":[123],"be":[126],"unavailable":[127],"multiparty":[129],"conversations.":[130],"To":[131],"tackle":[132],"these":[133],"issues,":[134],"we":[135],"propose":[136],"Possible":[138],"Speaker":[139],"Informed":[140],"framework":[146],"(PSI).":[147],"PSI":[148,171,188],"specifically":[150],"tasked":[151],"extract":[153],"(speech)":[155],"(face)":[158],"possible":[162],"speaker":[163],"presence":[166],"parties.":[169],"Further,":[170],"seamlessly":[172],"extracts":[173],"rich":[175],"unimodal":[176],"features":[177],"fuses":[179],"them":[180],"while":[181],"addressing":[182],"unavailability":[184],"specific":[186],"modalities.":[187],"demonstrates":[189],"competitive":[190],"existing":[193],"state-of-the-art":[194],"through":[196],"experiments":[197],"benchmark":[200],"dataset.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
