{"id":"https://openalex.org/W4415536330","doi":"https://doi.org/10.1145/3746027.3755093","title":"REA-Listener: Real-Time Listening Head Generation with Dynamic Emotion Modeling and Flexible Modality Adaptation","display_name":"REA-Listener: Real-Time Listening Head Generation with Dynamic Emotion Modeling and Flexible Modality Adaptation","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415536330","doi":"https://doi.org/10.1145/3746027.3755093"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755093","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755093","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043519307","display_name":"Sizhe Zhao","orcid":"https://orcid.org/0009-0004-2797-0389"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Sizhe Zhao","raw_affiliation_strings":["Harbin Institute of Technology, Weihai, China"],"raw_orcid":"https://orcid.org/0009-0004-2797-0389","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Weihai, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100424710","display_name":"Chenyang Wang","orcid":"https://orcid.org/0000-0003-1865-7572"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenyang Wang","raw_affiliation_strings":["Harbin Institute of Technology, Weihai, China"],"raw_orcid":"https://orcid.org/0000-0003-1865-7572","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Weihai, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002473193","display_name":"Weiyu Zhao","orcid":"https://orcid.org/0009-0002-2661-3692"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiyu Zhao","raw_affiliation_strings":["Harbin Institute of Technology, Weihai, China"],"raw_orcid":"https://orcid.org/0009-0002-2661-3692","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Weihai, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100731668","display_name":"Zonglin Li","orcid":"https://orcid.org/0000-0002-4181-310X"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zonglin Li","raw_affiliation_strings":["Harbin Institute of Technology, Weihai, China"],"raw_orcid":"https://orcid.org/0000-0002-4181-310X","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Weihai, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092316073","display_name":"Ming Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210144143","display_name":"Inspur (China)","ror":"https://ror.org/0474p4r72","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210144143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Li","raw_affiliation_strings":["Shandong Inspur Database Technology Co., Ltd, Jinan, China"],"raw_orcid":"https://orcid.org/0009-0008-1032-7281","affiliations":[{"raw_affiliation_string":"Shandong Inspur Database Technology Co., Ltd, Jinan, China","institution_ids":["https://openalex.org/I4210144143"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084025984","display_name":"Shengping Zhang","orcid":"https://orcid.org/0000-0001-5200-3420"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengping Zhang","raw_affiliation_strings":["Harbin Institute of Technology, Weihai, China"],"raw_orcid":"https://orcid.org/0000-0001-5200-3420","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Weihai, China","institution_ids":["https://openalex.org/I204983213"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5043519307"],"corresponding_institution_ids":["https://openalex.org/I204983213"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30232648,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9733","last_page":"9742"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13289","display_name":"Infant Health and Development","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/3611","display_name":"Pharmacy"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9836999773979187,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/active-listening","display_name":"Active listening","score":0.6330999732017517},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5931000113487244},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5134000182151794},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.49729999899864197},{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.45249998569488525},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4293000102043152},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.4002000093460083},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.34139999747276306}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8223000168800354},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6370000243186951},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.6330999732017517},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5931000113487244},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5134000182151794},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.49729999899864197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45910000801086426},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4293000102043152},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.4002000093460083},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36880001425743103},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.34139999747276306},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.32670000195503235},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.29980000853538513},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.2992999851703644},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C38129911","wikidata":"https://www.wikidata.org/wiki/Q4820038","display_name":"Auditory scene analysis","level":3,"score":0.29319998621940613},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.28780001401901245},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.25870001316070557}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755093","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755093","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8248988726","display_name":null,"funder_award_id":"62402136","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2008208299","https://openalex.org/W2134963900","https://openalex.org/W2150610356","https://openalex.org/W2769666294","https://openalex.org/W3019952993","https://openalex.org/W3081492798","https://openalex.org/W3180794345","https://openalex.org/W3197199219","https://openalex.org/W3204680331","https://openalex.org/W3211147706","https://openalex.org/W4225760288","https://openalex.org/W4281730245","https://openalex.org/W4283710679","https://openalex.org/W4312309978","https://openalex.org/W4312444931","https://openalex.org/W4385318467","https://openalex.org/W4385490328","https://openalex.org/W4386072021","https://openalex.org/W4390872769","https://openalex.org/W4402727135"],"related_works":[],"abstract_inverted_index":{"Listening":[0],"head":[1,11,55,111,128,147,159,187],"generation":[2,161],"aims":[3],"to":[4,15,51,76,143],"synthesize":[5],"realistic":[6],"and":[7,29,36,61,83,176],"responsive":[8],"non-verbal":[9],"listener":[10,104,121,146],"motions":[12],"that":[13,179],"respond":[14],"speakers":[16],"in":[17,38,149,185],"conversational":[18],"scenarios.":[19,40],"Existing":[20],"methods":[21,184],"typically":[22],"rely":[23],"on":[24,139,164],"fixed":[25],"audio-visual":[26],"input":[27],"modalities":[28],"predefined":[30],"emotion":[31,63,117],"labels,":[32],"limiting":[33],"their":[34],"adaptability":[35],"expressiveness":[37],"real-world":[39],"In":[41],"this":[42],"paper,":[43],"we":[44,66,106,133],"propose":[45,68],"a":[46,69,87,108,115,135,165],"novel":[47],"real-time":[48,172],"framework,":[49],"REA-Listener,":[50],"generate":[52],"high-fidelity":[53],"listening":[54,186],"videos":[56,148],"with":[57,114],"flexible":[58],"modality":[59,95],"adaptation":[60],"dynamic":[62],"modeling.":[64],"Specifically,":[65],"first":[67],"Modality-Adaptive":[70],"Mixture":[71],"of":[72,80,103],"Experts":[73],"(MA-MoE)":[74],"module":[75],"encode":[77],"arbitrary":[78],"combinations":[79],"speaker":[81,125],"audio":[82],"visual":[84],"signals":[85],"into":[86],"unified":[88],"embedding":[89],"space,":[90],"ensuring":[91],"robustness":[92],"under":[93],"partial":[94],"conditions.":[96],"To":[97],"further":[98],"enhance":[99],"the":[100],"temporal":[101],"consistency":[102],"emotion,":[105],"present":[107],"lightweight":[109],"emotional":[110],"dynamics":[112],"generator":[113],"multi-modal":[116],"predictor,":[118],"which":[119],"infers":[120],"emotions":[122],"dynamically":[123],"from":[124],"context":[126],"alongside":[127],"motion":[129,160],"coefficient":[130],"prediction.":[131],"Finally,":[132],"employ":[134],"3D-aware":[136],"renderer":[137],"based":[138],"3D":[140],"Gaussian":[141],"Splatting":[142],"produce":[144],"high-quality":[145],"real":[150],"time.":[151],"With":[152],"these":[153],"components,":[154],"our":[155,180],"approach":[156],"achieves":[157],"efficient":[158],"at":[162],"30fps":[163],"single":[166],"NVIDIA":[167],"RTX":[168],"3090":[169],"GPU,":[170],"supporting":[171],"interaction.":[173],"Extensive":[174],"evaluations":[175],"applications":[177],"demonstrate":[178],"method":[181],"outperforms":[182],"state-of-the-art":[183],"generation.":[188]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
