{"id":"https://openalex.org/W4372267150","doi":"https://doi.org/10.1109/icassp49357.2023.10096885","title":"Cross-Modal Fusion Techniques for Utterance-Level Emotion Recognition from Text and Speech","display_name":"Cross-Modal Fusion Techniques for Utterance-Level Emotion Recognition from Text and Speech","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372267150","doi":"https://doi.org/10.1109/icassp49357.2023.10096885"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096885","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096885","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056746787","display_name":"Jiachen Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Jiachen Luo","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music,UK","Centre for Digital Music, Queen Mary University of London, UK"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music,UK","institution_ids":["https://openalex.org/I166337079"]},{"raw_affiliation_string":"Centre for Digital Music, Queen Mary University of London, UK","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058452657","display_name":"Huy Phan","orcid":"https://orcid.org/0000-0003-4096-785X"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Huy Phan","raw_affiliation_strings":["Amazon Alexa,Cambridge,MA,USA","Amazon Alexa, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Amazon Alexa,Cambridge,MA,USA","institution_ids":["https://openalex.org/I1311688040"]},{"raw_affiliation_string":"Amazon Alexa, Cambridge, MA, USA","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5111403298","display_name":"Joshua D. Reiss","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Joshua Reiss","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music,UK","Centre for Digital Music, Queen Mary University of London, UK"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music,UK","institution_ids":["https://openalex.org/I166337079"]},{"raw_affiliation_string":"Centre for Digital Music, Queen Mary University of London, UK","institution_ids":["https://openalex.org/I166337079"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5056746787"],"corresponding_institution_ids":["https://openalex.org/I166337079"],"apc_list":null,"apc_paid":null,"fwci":4.8973,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.95459846,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.7953400611877441},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7283411026000977},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.6967267990112305},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.6788145303726196},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5127483606338501},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5092755556106567},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5000553131103516},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4933392107486725},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.47594594955444336},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.4650866389274597},{"id":"https://openalex.org/keywords/expression","display_name":"Expression (computer science)","score":0.45127761363983154},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.08195114135742188},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.07375147938728333}],"concepts":[{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.7953400611877441},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7283411026000977},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.6967267990112305},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.6788145303726196},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5127483606338501},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5092755556106567},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5000553131103516},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4933392107486725},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.47594594955444336},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.4650866389274597},{"id":"https://openalex.org/C90559484","wikidata":"https://www.wikidata.org/wiki/Q778379","display_name":"Expression (computer science)","level":2,"score":0.45127761363983154},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.08195114135742188},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.07375147938728333},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096885","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096885","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:qmro.qmul.ac.uk:123456789/97996","is_oa":false,"landing_page_url":"https://qmro.qmul.ac.uk/xmlui/handle/123456789/97996","pdf_url":null,"source":{"id":"https://openalex.org/S4306400530","display_name":"Queen Mary Research Online (Queen Mary University of London)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I166337079","host_organization_name":"Queen Mary University of London","host_organization_lineage":["https://openalex.org/I166337079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference Proceeding"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7200000286102295,"display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320311061","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W2085662862","https://openalex.org/W2146334809","https://openalex.org/W2788776247","https://openalex.org/W2796430037","https://openalex.org/W2805662932","https://openalex.org/W2891359673","https://openalex.org/W2895615590","https://openalex.org/W2899099710","https://openalex.org/W2939574508","https://openalex.org/W2946218857","https://openalex.org/W2963686995","https://openalex.org/W2963710346","https://openalex.org/W2969889150","https://openalex.org/W3007708573","https://openalex.org/W3084624816","https://openalex.org/W3095251683","https://openalex.org/W3098556456","https://openalex.org/W3107951865","https://openalex.org/W3126625480","https://openalex.org/W3145643603","https://openalex.org/W3148040514","https://openalex.org/W3160183718","https://openalex.org/W3161565210","https://openalex.org/W3162589478","https://openalex.org/W3174977508","https://openalex.org/W3183891373","https://openalex.org/W3196711847","https://openalex.org/W3202667537","https://openalex.org/W4206326349","https://openalex.org/W4220829848","https://openalex.org/W4220887861","https://openalex.org/W4221147459","https://openalex.org/W4289329331","https://openalex.org/W4385245566","https://openalex.org/W6739901393","https://openalex.org/W6784350020","https://openalex.org/W6800666689"],"related_works":["https://openalex.org/W2965552754","https://openalex.org/W3198037411","https://openalex.org/W4226120250","https://openalex.org/W4385823311","https://openalex.org/W2367170701","https://openalex.org/W4309568943","https://openalex.org/W4366308393","https://openalex.org/W3024085187","https://openalex.org/W4226239215","https://openalex.org/W4385763807"],"abstract_inverted_index":{"Multimodal":[0],"emotion":[1,97],"recognition":[2],"(MER)":[3],"is":[4,117],"a":[5,33],"fundamental":[6],"complex":[7],"research":[8],"problem":[9],"due":[10],"to":[11,55,119,139],"the":[12,19,60,80,84,106,110,131,150,153,157,161,165,169],"uncertainty":[13],"of":[14,72,109,126],"human":[15,34],"emotional":[16],"expression":[17],"and":[18,26,66,69,77,102,114,123,128,134,144,156],"heterogeneity":[20,61],"gap":[21,62],"between":[22,63],"different":[23],"modalities.":[24],"Audio":[25],"text":[27],"modalities":[28],"are":[29,137],"particularly":[30],"important":[31],"for":[32,48,96],"participant":[35],"in":[36,83],"understanding":[37],"emotions.":[38],"Although":[39],"many":[40],"successful":[41],"attempts":[42],"have":[43],"been":[44],"designed":[45,118],"multimodal":[46,64],"representations":[47],"MER,":[49],"there":[50],"still":[51],"exist":[52],"multiple":[53,73],"challenges":[54],"be":[56],"addressed:":[57],"1)":[58],"bridging":[59],"features":[65],"model":[67,95,140],"inter-":[68,122],"intramodal":[70],"interactions":[71,125],"modalities;":[74],"2)":[75],"effectively":[76],"efficiently":[78],"modeling":[79],"contextual":[81,142],"dynamics":[82],"conversation":[85],"sequence.":[86],"In":[87],"this":[88],"paper,":[89],"we":[90],"propose":[91],"Cross-Modal":[92],"RoBERTa":[93],"(CM-RoBERTa)":[94],"detection":[98],"from":[99],"spoken":[100],"audio":[101,127],"corresponding":[103],"transcripts.":[104],"As":[105],"core":[107],"unit":[108],"CM-RoBERTa,":[111],"parallel":[112],"self-":[113],"cross-":[115],"attention":[116],"dynamically":[120],"capture":[121],"intra-modal":[124],"text.":[129],"Specially,":[130],"mid-level":[132],"fusion":[133],"residual":[135],"module":[136],"employed":[138],"longterm":[141],"dependencies":[143],"learn":[145],"modality-specific":[146],"patterns.":[147],"We":[148],"evaluate":[149],"approach":[151,163],"on":[152,168],"MELD":[154],"dataset":[155],"experimental":[158],"results":[159],"show":[160],"proposed":[162],"achieves":[164],"state-of-art":[166],"performance":[167],"dataset.":[170]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":1}],"updated_date":"2026-03-25T14:56:36.534964","created_date":"2025-10-10T00:00:00"}
