{"id":"https://openalex.org/W4410088807","doi":"https://doi.org/10.1145/3696410.3714739","title":"Towards Multimodal Empathetic Response Generation: A Rich Text-Speech-Vision Avatar-based Benchmark","display_name":"Towards Multimodal Empathetic Response Generation: A Rich Text-Speech-Vision Avatar-based Benchmark","publication_year":2025,"publication_date":"2025-04-22","ids":{"openalex":"https://openalex.org/W4410088807","doi":"https://doi.org/10.1145/3696410.3714739"},"language":"en","primary_location":{"id":"doi:10.1145/3696410.3714739","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3696410.3714739","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714739","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714739","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100399305","display_name":"Han Zhang","orcid":"https://orcid.org/0009-0000-2090-374X"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Han Zhang","raw_affiliation_strings":["School of Electronic Engineering, Xidian University, Xi'an, Shannxi, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Xidian University, Xi'an, Shannxi, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100597444","display_name":"Zixiang Meng","orcid":null},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zixiang Meng","raw_affiliation_strings":["School of Cyber Science and Engineering, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100634637","display_name":"Meng Luo","orcid":"https://orcid.org/0000-0003-2274-5719"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Meng Luo","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058516510","display_name":"Hong Han","orcid":"https://orcid.org/0000-0002-8019-3740"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Han","raw_affiliation_strings":["School of Electronic Engineering, Xidian University, Xi'an, Shaanxi, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Xidian University, Xi'an, Shaanxi, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081165986","display_name":"Lizi Liao","orcid":"https://orcid.org/0000-0002-9973-3305"},"institutions":[{"id":"https://openalex.org/I79891267","display_name":"Singapore Management University","ror":"https://ror.org/050qmg959","country_code":"SG","type":"education","lineage":["https://openalex.org/I79891267"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Lizi Liao","raw_affiliation_strings":["Singapore Management University, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Singapore Management University, Singapore, Singapore","institution_ids":["https://openalex.org/I79891267"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100752356","display_name":"Erik Cambria","orcid":"https://orcid.org/0000-0002-3030-1280"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Erik Cambria","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055815455","display_name":"Hao Fei","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Hao Fei","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100399305"],"corresponding_institution_ids":["https://openalex.org/I149594827"],"apc_list":null,"apc_paid":null,"fwci":22.6649,"has_fulltext":true,"cited_by_count":8,"citation_normalized_percentile":{"value":0.99266499,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2872","last_page":"2881"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9876000285148621,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/avatar","display_name":"Avatar","score":0.9343470335006714},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7353981733322144},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7119433879852295},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.5210689902305603},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.485434353351593},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.46607279777526855},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42913779616355896},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3478306531906128},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.3450632393360138},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.33496835827827454}],"concepts":[{"id":"https://openalex.org/C2777365542","wikidata":"https://www.wikidata.org/wiki/Q83090","display_name":"Avatar","level":2,"score":0.9343470335006714},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7353981733322144},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7119433879852295},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5210689902305603},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.485434353351593},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.46607279777526855},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42913779616355896},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3478306531906128},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3450632393360138},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.33496835827827454},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3696410.3714739","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3696410.3714739","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714739","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3696410.3714739","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3696410.3714739","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3696410.3714739","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410088807.pdf","grobid_xml":"https://content.openalex.org/works/W4410088807.grobid-xml"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W2134963900","https://openalex.org/W2159269332","https://openalex.org/W2187089797","https://openalex.org/W2997087088","https://openalex.org/W3081168214","https://openalex.org/W3200833038","https://openalex.org/W4283798957","https://openalex.org/W4307079201","https://openalex.org/W4320018808","https://openalex.org/W4385571452","https://openalex.org/W4392384245","https://openalex.org/W4395470960","https://openalex.org/W4401042155","https://openalex.org/W4402670985","https://openalex.org/W4402671597","https://openalex.org/W4402727574","https://openalex.org/W4403791424"],"related_works":["https://openalex.org/W3138471234","https://openalex.org/W4247958311","https://openalex.org/W4396832849","https://openalex.org/W1584662471","https://openalex.org/W2785089443","https://openalex.org/W2265117524","https://openalex.org/W1467576422","https://openalex.org/W4220730560","https://openalex.org/W2969390373","https://openalex.org/W2548809491"],"abstract_inverted_index":{"Empathetic":[0],"Response":[1],"Generation":[2],"(ERG)":[3],"is":[4,31],"one":[5],"of":[6,10,99,107,154,161],"the":[7,11,35,159],"key":[8],"tasks":[9],"affective":[12],"computing":[13],"area,":[14],"which":[15,79],"aims":[16],"to":[17,24,34],"produce":[18],"emotionally":[19],"nuanced":[20],"and":[21,66,90,102,132,148,164,189,193],"compassionate":[22],"responses":[23],"user's":[25],"queries.":[26],"However,":[27],"existing":[28],"ERG":[29,59,83,188],"research":[30],"predominantly":[32],"confined":[33],"singleton":[36],"text":[37,82],"modality,":[38],"limiting":[39],"its":[40],"effectiveness":[41],"since":[42],"human":[43,87],"emotions":[44],"are":[45,195],"inherently":[46],"conveyed":[47],"through":[48],"multiple":[49],"modalities.":[50,169],"To":[51],"combat":[52],"this,":[53],"we":[54,111,150],"introduce":[55],"an":[56],"avatar-based":[57],"Multimodal":[58,123],"(MERG)":[60],"task,":[61],"entailing":[62],"rich":[63],"text,":[64],"speech,":[65],"facial":[67],"vision":[68],"information.":[69],"We":[70],"first":[71],"present":[72],"a":[73,96,114,122,152],"large-scale":[74],"high-quality":[75],"benchmark":[76],"dataset,":[77],"AvaMERG,":[78],"extends":[80],"traditional":[81],"by":[84],"incorporating":[85],"authentic":[86],"speech":[88,131],"audio":[89],"dynamic":[91],"talking-face":[92],"avatar":[93,100,133],"videos,":[94],"encompassing":[95],"diverse":[97],"range":[98],"profiles":[101],"broadly":[103],"covering":[104],"various":[105],"topics":[106],"real-world":[108],"scenarios.":[109],"Further,":[110],"deliberately":[112],"tailor":[113],"system,":[115],"named":[116],"Empatheia,":[117],"for":[118,144],"MERG.":[119,190],"Built":[120],"upon":[121],"Large":[124],"Language":[125],"Model":[126],"(MLLM)":[127],"with":[128,139],"multimodal":[129],"encoder,":[130],"generators,":[134],"Empatheia":[135,177],"performs":[136],"end-to-end":[137],"MERG,":[138],"Chain-of-Empathetic":[140],"reasoning":[141],"mechanism":[142],"integrated":[143],"enhanced":[145],"empathy":[146],"understanding":[147],"reasoning.Finally,":[149],"devise":[151],"list":[153],"empathetic-enhanced":[155],"tuning":[156],"strategies,":[157],"strengthening":[158],"capabilities":[160],"emotional":[162],"accuracy":[163],"content,":[165],"avatar-profile":[166],"consistency":[167],"across":[168],"Experimental":[170],"results":[171],"on":[172,185],"AvaMERG":[173],"data":[174,192],"demonstrate":[175],"that":[176],"consistently":[178],"shows":[179],"superior":[180],"performance":[181],"than":[182],"baseline":[183],"methods":[184],"both":[186],"textual":[187],"All":[191],"code":[194],"open":[196],"at":[197],"https://AvaMERG.github.io/.":[198]},"counts_by_year":[{"year":2025,"cited_by_count":8}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
