{"id":"https://openalex.org/W7139092333","doi":"https://doi.org/10.48550/arxiv.2603.17427","title":"ECHO: Towards Emotionally Appropriate and Contextually Aware Interactive Head Generation","display_name":"ECHO: Towards Emotionally Appropriate and Contextually Aware Interactive Head Generation","publication_year":2026,"publication_date":"2026-03-18","ids":{"openalex":"https://openalex.org/W7139092333","doi":"https://doi.org/10.48550/arxiv.2603.17427"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.17427","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.17427","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.17427","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129929201","display_name":"Xiangyu Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kong, Xiangyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088922727","display_name":"Xiaoyu Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Xiaoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Pan, Yihan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Yihan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035524175","display_name":"Haoqin Sun","orcid":"https://orcid.org/0000-0002-8554-8969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Haoqin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100661372","display_name":"Hengde Zhu","orcid":"https://orcid.org/0000-0001-7027-3969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hengde","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109238781","display_name":"Xiaoming Xu","orcid":"https://orcid.org/0009-0009-4670-8411"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xiaoming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130069312","display_name":"Xiaoming Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Xiaoming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129859466","display_name":"Lu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Lu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130164554","display_name":"Siyang Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Siyang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5129929201"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.6909999847412109,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.6909999847412109,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.08860000222921371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.08259999752044678,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/avatar","display_name":"Avatar","score":0.5777999758720398},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5547999739646912},{"id":"https://openalex.org/keywords/articulation","display_name":"Articulation (sociology)","score":0.5245000123977661},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.44769999384880066},{"id":"https://openalex.org/keywords/contextual-design","display_name":"Contextual design","score":0.4339999854564667},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.43389999866485596},{"id":"https://openalex.org/keywords/facial-expression","display_name":"Facial expression","score":0.4259999990463257},{"id":"https://openalex.org/keywords/sensory-cue","display_name":"Sensory cue","score":0.4253999888896942}],"concepts":[{"id":"https://openalex.org/C2777365542","wikidata":"https://www.wikidata.org/wiki/Q83090","display_name":"Avatar","level":2,"score":0.5777999758720398},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5547999739646912},{"id":"https://openalex.org/C2779337067","wikidata":"https://www.wikidata.org/wiki/Q4800961","display_name":"Articulation (sociology)","level":3,"score":0.5245000123977661},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.49000000953674316},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4878000020980835},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.45210000872612},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.44769999384880066},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4456000030040741},{"id":"https://openalex.org/C71611378","wikidata":"https://www.wikidata.org/wiki/Q5165191","display_name":"Contextual design","level":3,"score":0.4339999854564667},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.43389999866485596},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.4259999990463257},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.4253999888896942},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4246000051498413},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.39590001106262207},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.33469998836517334},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.32850000262260437},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3264000117778778},{"id":"https://openalex.org/C10090317","wikidata":"https://www.wikidata.org/wiki/Q7551030","display_name":"Social cue","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.27059999108314514},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.26840001344680786},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.2549999952316284},{"id":"https://openalex.org/C2777582232","wikidata":"https://www.wikidata.org/wiki/Q5013414","display_name":"CONTEST","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.17427","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.17427","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.17427","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.17427","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"natural":[1],"face-to-face":[2],"interaction,":[3],"participants":[4],"seamlessly":[5],"alternate":[6],"between":[7],"speaking":[8],"and":[9,22,27,56,74,105,152,160,167,204,215],"listening,":[10],"producing":[11],"facial":[12,100,190],"behaviors":[13,55,101],"(FBs)":[14],"that":[15,144,176],"are":[16],"finely":[17],"informed":[18],"by":[19,193],"long-range":[20,93],"context":[21],"naturally":[23],"exhibit":[24],"contextual":[25,94,103,146,158,185],"appropriateness":[26,159],"emotional":[28,161],"rationality.":[29],"Interactive":[30],"Head":[31],"Generation":[32],"(IHG)":[33],"aims":[34],"to":[35,98,156,199],"synthesize":[36],"lifelike":[37],"avatar":[38,165],"head":[39],"video":[40],"emulating":[41],"such":[42],"capabilities.":[43],"Existing":[44],"IHG":[45,132,218],"methods":[46],"typically":[47],"condition":[48],"on":[49,88],"dual-track":[50,112],"signals":[51,113],"(i.e.,":[52],"human":[53],"user's":[54],"pre-defined":[57],"audio":[58],"for":[59,188],"avatar)":[60],"within":[61],"a":[62,130,138,168],"short":[63],"temporal":[64],"window,":[65],"jointly":[66,200],"driving":[67],"generation":[68],"of":[69,111,148,163,212],"avatar's":[70],"audio-aligned":[71],"lip":[72,179,202],"articulation":[73,180],"non-verbal":[75],"FBs.":[76],"However,":[77],"two":[78,135],"main":[79],"challenges":[80],"persist":[81],"in":[82],"these":[83],"methods:":[84],"(i)":[85],"the":[86,107,210],"reliance":[87],"short-clip":[89],"behavioral":[90,186],"cues":[91,187],"without":[92],"modeling":[95],"leads":[96],"them":[97],"produce":[99],"lacking":[102],"appropriateness;":[104],"(ii)":[106],"entangled,":[108],"role-agnostic":[109],"fusion":[110],"empirically":[114],"introduces":[115],"cross-signal":[116],"interference,":[117],"potentially":[118],"compromising":[119],"lip-region":[120],"synchronization":[121,203],"during":[122],"speaking.":[123],"To":[124],"this":[125],"end,":[126],"we":[127],"propose":[128],"ECHO,":[129],"novel":[131],"framework":[133],"comprising":[134],"key":[136],"components:":[137],"Long-range":[139],"Contextual":[140],"Understanding":[141],"(LCU)":[142],"component":[143],"facilitates":[145],"understanding":[147],"both":[149],"behavior-grounded":[150],"dynamics":[151],"linguistic-driven":[153],"affective":[154],"semantics":[155],"promote":[157],"rationality":[162],"synthesized":[164],"FBs;":[166],"block-wise":[169],"Spatial-aware":[170],"Decoupled":[171],"Cross-attention":[172],"Modulation":[173],"(SDCM)":[174],"module,":[175],"preserves":[177],"self-audio-driven":[178],"while":[181],"adaptively":[182],"integrating":[183],"user":[184],"non-lip":[189],"regions,":[191],"complemented":[192],"our":[194],"designed":[195],"two-stage":[196],"training":[197],"paradigm,":[198],"enhance":[201],"visual":[205],"fidelity.":[206],"Extensive":[207],"experiments":[208],"demonstrate":[209],"effectiveness":[211],"proposed":[213],"components":[214],"ECHO's":[216],"superior":[217],"performance.":[219]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2026-03-20T00:00:00"}
