{"id":"https://openalex.org/W7127451256","doi":"https://doi.org/10.48550/arxiv.2602.00702","title":"JoyStreamer: Unlocking Highly Expressive Avatars via Harmonized Text-Audio Conditioning","display_name":"JoyStreamer: Unlocking Highly Expressive Avatars via Harmonized Text-Audio Conditioning","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7127451256","doi":"https://doi.org/10.48550/arxiv.2602.00702"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.00702","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124890102","display_name":"Ruikui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Ruikui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124885870","display_name":"Jinheng Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Jinheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124930353","display_name":"Lang Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Lang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060295796","display_name":"Huaishao Luo","orcid":"https://orcid.org/0000-0003-4297-1270"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Huaishao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124922509","display_name":"Chaochao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chaochao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124913381","display_name":"Liangbo Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Liangbo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124946818","display_name":"Huan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Huan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101324413","display_name":"Youzheng Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Youzheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124899685","display_name":"Xiaodong He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Xiaodong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5124890102"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.28630000352859497,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.28630000352859497,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.19609999656677246,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.08330000191926956,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/avatar","display_name":"Avatar","score":0.9156000018119812},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6485999822616577},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.4715000092983246},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.40959998965263367},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.3434999883174896},{"id":"https://openalex.org/keywords/duration","display_name":"Duration (music)","score":0.3224000036716461},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.3043999969959259}],"concepts":[{"id":"https://openalex.org/C2777365542","wikidata":"https://www.wikidata.org/wiki/Q83090","display_name":"Avatar","level":2,"score":0.9156000018119812},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.784500002861023},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6485999822616577},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5040000081062317},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5029000043869019},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.4715000092983246},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.40959998965263367},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.35499998927116394},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.3434999883174896},{"id":"https://openalex.org/C112758219","wikidata":"https://www.wikidata.org/wiki/Q16038819","display_name":"Duration (music)","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3043999969959259},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.2930000126361847},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C45262634","wikidata":"https://www.wikidata.org/wiki/Q5159291","display_name":"Conditioning","level":2,"score":0.2515000104904175},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.00702","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.00702","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.00702","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.00702","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7249399423599243}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"video":[1,197],"avatar":[2,65,136,156],"models":[3,176],"have":[4],"demonstrated":[5],"impressive":[6],"capabilities":[7],"in":[8],"scenarios":[9],"such":[10,158,177],"as":[11,150,152,159,178],"talking,":[12],"public":[13],"speaking,":[14],"and":[15,110,146,162,180,192],"singing.":[16],"However,":[17],"the":[18,33,82,89,103,114,123,135,154,174],"majority":[19],"of":[20,61,105],"these":[21],"methods":[22],"exhibit":[23],"limited":[24],"alignment":[25],"with":[26],"respect":[27],"to":[28,84,119,132,139],"text":[29],"instructions,":[30],"particularly":[31],"when":[32],"prompts":[34],"involve":[35],"complex":[36,187],"elements":[37],"including":[38,189],"large":[39],"full-body":[40,144],"movement,":[41],"dynamic":[42,147],"camera":[43,148],"trajectory,":[44],"background":[45],"transitions,":[46],"or":[47],"human-object":[48],"interactions.":[49],"To":[50],"break":[51],"out":[52],"this":[53],"limitation,":[54],"we":[55,73,100],"present":[56],"JoyAvatar,":[57],"a":[58,75],"framework":[59],"capable":[60],"generating":[62],"long":[63],"duration":[64],"videos,":[66],"featuring":[67],"two":[68,128],"key":[69,129],"technical":[70],"innovations.":[71],"Firstly,":[72],"introduce":[74],"twin-teacher":[76],"enhanced":[77],"training":[78],"algorithm":[79],"that":[80,169],"enables":[81,186],"model":[83,91,172],"transfer":[85],"inherent":[86],"text-controllability":[87],"from":[88],"foundation":[90],"while":[92],"simultaneously":[93],"learning":[94],"audio-visual":[95],"synchronization.":[96],"Secondly,":[97],"during":[98],"training,":[99],"dynamically":[101],"modulate":[102],"strength":[104],"multi-modal":[106],"conditions":[107],"(e.g.,":[108],"audio":[109],"text)":[111],"based":[112],"on":[113,201],"distinct":[115],"denoising":[116],"timestep,":[117],"aiming":[118],"mitigate":[120],"conflicts":[121],"between":[122],"heterogeneous":[124],"conditioning":[125],"signals.":[126],"These":[127],"designs":[130],"serve":[131],"substantially":[133],"expand":[134],"model's":[137],"capacity":[138],"generate":[140],"natural,":[141],"temporally":[142],"coherent":[143],"motions":[145],"movements":[149],"well":[151],"preserve":[153],"basic":[155],"capabilities,":[157],"accurate":[160],"lip-sync":[161],"identity":[163],"consistency.":[164],"GSB":[165],"evaluation":[166],"results":[167],"demonstrate":[168],"our":[170,184],"JoyStreamer":[171],"outperforms":[173],"state-of-the-art":[175],"Omnihuman-1.5":[179],"KlingAvatar":[181],"2.0.":[182],"Moreover,":[183],"approach":[185],"applications":[188],"multi-person":[190],"dialogues":[191],"non-human":[193],"subjects":[194],"role-playing.":[195],"Some":[196],"samples":[198],"are":[199],"provided":[200],"https://joystreamer.github.io/.":[202]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-04T00:00:00"}
