{"id":"https://openalex.org/W4403713281","doi":"https://doi.org/10.1145/3689092.3689401","title":"Multimodal Emotion Recognition with Vision-language Prompting and Modality Dropout","display_name":"Multimodal Emotion Recognition with Vision-language Prompting and Modality Dropout","publication_year":2024,"publication_date":"2024-10-23","ids":{"openalex":"https://openalex.org/W4403713281","doi":"https://doi.org/10.1145/3689092.3689401"},"language":"en","primary_location":{"id":"doi:10.1145/3689092.3689401","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3689092.3689401","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038123809","display_name":"Anbin Qi","orcid":"https://orcid.org/0009-0009-1015-7494"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Anbin Qi","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050313677","display_name":"Z. J. Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhongliang Liu","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024551899","display_name":"Xinyong Zhou","orcid":"https://orcid.org/0000-0002-2774-2106"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xinyong Zhou","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114035603","display_name":"Jinba Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinba Xiao","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039706556","display_name":"F Zhang","orcid":"https://orcid.org/0009-0005-9488-5077"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fengrun Zhang","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103769031","display_name":"Qi Gan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi Gan","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066084548","display_name":"Ming Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming Tao","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018147681","display_name":"Gaozheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gaozheng Zhang","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110653828","display_name":"Lu Zhang","orcid":"https://orcid.org/0000-0003-2284-8005"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu Zhang","raw_affiliation_strings":["Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Soul AI, Shanghai Soulgate Technology Co., Ltd., Beijing, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5038123809"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.2436,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.89523705,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"49","last_page":"53"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dropout","display_name":"Dropout (neural networks)","score":0.7331892251968384},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.7063073515892029},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5838138461112976},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44953399896621704},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.4108952581882477},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3934464752674103},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.33649587631225586},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.33556216955184937},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.10509049892425537}],"concepts":[{"id":"https://openalex.org/C2776145597","wikidata":"https://www.wikidata.org/wiki/Q25339462","display_name":"Dropout (neural networks)","level":2,"score":0.7331892251968384},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.7063073515892029},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5838138461112976},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44953399896621704},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.4108952581882477},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3934464752674103},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.33649587631225586},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.33556216955184937},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.10509049892425537}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3689092.3689401","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3689092.3689401","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W1530404542","https://openalex.org/W3205500251","https://openalex.org/W4285595742","https://openalex.org/W4312310776","https://openalex.org/W4312639100","https://openalex.org/W4386065554","https://openalex.org/W4386071547","https://openalex.org/W4387969462"],"related_works":["https://openalex.org/W3082178636","https://openalex.org/W2782041652","https://openalex.org/W2612657834","https://openalex.org/W2392157706","https://openalex.org/W2599192953","https://openalex.org/W1987310671","https://openalex.org/W2952088488","https://openalex.org/W1521968289","https://openalex.org/W3126677997","https://openalex.org/W1610857240"],"abstract_inverted_index":{"In":[0,117],"this":[1,118],"paper,":[2],"we":[3,26,35,81,98,108,120],"present":[4],"our":[5,129,142],"solution":[6],"for":[7,30,49,85,105],"the":[8,18,62,73,103,135,147,156],"Second":[9],"Multimodal":[10,31],"Emotion":[11,32],"Recognition":[12],"Challenge":[13],"Track":[14],"1(MER2024-SEMI).":[15],"To":[16],"enhance":[17],"accuracy":[19,152],"and":[20,131],"generalization":[21],"performance":[22,63],"of":[23,64,75,153],"emotion":[24,51],"recognition,":[25],"propose":[27],"several":[28],"methods":[29],"Recognition.":[33],"Firstly,":[34],"introduce":[36],"EmoVCLIP,":[37],"a":[38,110],"model":[39,130,143],"fine-tuned":[40],"based":[41],"on":[42,58,67,155],"CLIP":[43,66],"using":[44,100],"vision-language":[45],"prompt":[46,56,104],"learning,":[47],"designed":[48],"video-based":[50],"recognition":[52],"tasks.":[53],"By":[54],"leveraging":[55],"learning":[57],"CLIP,":[59],"EmoVCLIP":[60],"improves":[61],"pre-trained":[65],"emotional":[68,96],"videos.":[69,116],"Additionally,":[70],"to":[71,90,113],"address":[72],"issue":[74],"modality":[76,83],"dependence":[77],"in":[78,93,146],"multimodal":[79],"fusion,":[80],"employ":[82],"dropout":[84],"robust":[86],"information":[87],"fusion.":[88],"Furthermore,":[89],"aid":[91],"Baichuan":[92],"better":[94],"extracting":[95],"information,":[97],"suggest":[99],"GPT-4":[101],"as":[102],"Baichuan.":[106],"Lastly,":[107],"utilize":[109],"self-training":[111],"strategy":[112],"leverage":[114],"unlabeled":[115,122],"process,":[119],"use":[121],"videos":[123],"with":[124],"high-confidence":[125],"pseudo-labels":[126],"generated":[127],"by":[128],"incorporate":[132],"them":[133],"into":[134],"training":[136],"set.":[137,158],"Experimental":[138],"results":[139],"demonstrate":[140],"that":[141],"ranks":[144],"1st":[145],"MER2024-SEMI":[148],"track,":[149],"achieving":[150],"an":[151],"90.15%":[154],"test":[157]},"counts_by_year":[{"year":2025,"cited_by_count":9}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
