{"id":"https://openalex.org/W4417125240","doi":"https://doi.org/10.1145/3757377.3763952","title":"X-UniMotion: Animating Human Images with Expressive, Unified and Identity-Agnostic Motion Latents","display_name":"X-UniMotion: Animating Human Images with Expressive, Unified and Identity-Agnostic Motion Latents","publication_year":2025,"publication_date":"2025-12-08","ids":{"openalex":"https://openalex.org/W4417125240","doi":"https://doi.org/10.1145/3757377.3763952"},"language":null,"primary_location":{"id":"doi:10.1145/3757377.3763952","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3757377.3763952","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SIGGRAPH Asia 2025 Conference Papers","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012261844","display_name":"Guoxian Song","orcid":"https://orcid.org/0000-0002-3664-572X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guoxian Song","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100691898","display_name":"Hongyi Xu","orcid":"https://orcid.org/0009-0006-4455-5632"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongyi Xu","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070505636","display_name":"Xiaochen Zhao","orcid":"https://orcid.org/0000-0001-8976-7723"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaochen Zhao","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102446690","display_name":"You Xie","orcid":"https://orcid.org/0000-0001-7320-6518"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"You Xie","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020924907","display_name":"Tianpei Gu","orcid":"https://orcid.org/0000-0003-3173-8895"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianpei Gu","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063345044","display_name":"Zenan Li","orcid":"https://orcid.org/0009-0001-7794-5358"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zenan Li","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078876971","display_name":"Chenxu Zhang","orcid":"https://orcid.org/0000-0002-1971-1975"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chenxu Zhang","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081407963","display_name":"Linjie Luo","orcid":"https://orcid.org/0000-0001-6322-1175"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Linjie Luo","raw_affiliation_strings":["ByteDance, San Jose, USA"],"affiliations":[{"raw_affiliation_string":"ByteDance, San Jose, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5012261844"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.2158,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.85289329,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.4187999963760376,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.4187999963760376,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.3276999890804291,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.12890000641345978,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.7127000093460083},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5764999985694885},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.5600000023841858},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5576000213623047},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.48750001192092896},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4530999958515167},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.41280001401901245},{"id":"https://openalex.org/keywords/computer-facial-animation","display_name":"Computer facial animation","score":0.40310001373291016},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.3970000147819519},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.39070001244544983}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7849000096321106},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.7127000093460083},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7020000219345093},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6082000136375427},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5764999985694885},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.5600000023841858},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5576000213623047},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.48750001192092896},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4530999958515167},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.41280001401901245},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.40310001373291016},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.3970000147819519},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.39070001244544983},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3871000111103058},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.3693999946117401},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.364300012588501},{"id":"https://openalex.org/C2986578859","wikidata":"https://www.wikidata.org/wiki/Q657632","display_name":"Human motion","level":3,"score":0.35530000925064087},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.33320000767707825},{"id":"https://openalex.org/C90559484","wikidata":"https://www.wikidata.org/wiki/Q778379","display_name":"Expression (computer science)","level":2,"score":0.3305000066757202},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.31839999556541443},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.2985999882221222},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C77660652","wikidata":"https://www.wikidata.org/wiki/Q150971","display_name":"Computer graphics","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C2779757391","wikidata":"https://www.wikidata.org/wiki/Q6002292","display_name":"Image translation","level":3,"score":0.2703999876976013},{"id":"https://openalex.org/C72560505","wikidata":"https://www.wikidata.org/wiki/Q204510","display_name":"Motion interpolation","level":5,"score":0.2621999979019165},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.260699987411499}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3757377.3763952","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3757377.3763952","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SIGGRAPH Asia 2025 Conference Papers","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W2969985801","https://openalex.org/W2978956737","https://openalex.org/W3178284600","https://openalex.org/W3180770160","https://openalex.org/W4312473638","https://openalex.org/W4312933868","https://openalex.org/W4386066404","https://openalex.org/W4400573497","https://openalex.org/W4402704510","https://openalex.org/W4402754063","https://openalex.org/W4402951553","https://openalex.org/W4403791401","https://openalex.org/W4413144436"],"related_works":[],"abstract_inverted_index":{"We":[0,149],"present":[1],"X-UniMotion,":[2],"a":[3,45,49,97,112],"unified":[4],"and":[5,19,33,61,64,75,89,108,133,165,185],"expressive":[6,74,179],"implicit":[7],"latent":[8,55,109],"representation":[9,110],"for":[10,58,192],"whole-body":[11],"human":[12,41,124],"motion,":[13],"encompassing":[14],"facial":[15,59],"expressions,":[16],"body":[17,62],"poses,":[18],"hand":[20],"gestures.":[21],"Unlike":[22],"prior":[23],"motion":[24,42,69,81,106,155,167,183],"transfer":[25,82],"methods":[26],"that":[27,102,172],"rely":[28],"on":[29,118],"explicit":[30],"skeletal":[31],"poses":[32,88],"heuristic":[34],"cross-identity":[35,80,143],"adjustments,":[36],"our":[37],"approach":[38],"encodes":[39],"multi-granular":[40],"directly":[43],"from":[44],"single":[46],"image":[47],"into":[48],"compact":[50],"set":[51],"of":[52,142,154],"four":[53],"disentangled":[54],"tokens\u2014one":[56],"each":[57],"expression":[60],"pose,":[63],"one":[65],"per":[66],"hand.":[67],"These":[68],"latents":[70],"are":[71],"both":[72],"highly":[73,178],"identity-agnostic,":[76],"enabling":[77],"high-fidelity,":[78],"detailed":[79],"across":[83],"subjects":[84],"with":[85,181],"distinct":[86],"identities,":[87],"spatial":[90,132],"configurations.":[91],"To":[92],"achieve":[93],"this,":[94],"we":[95],"introduce":[96],"self-supervised,":[98],"end-to-end":[99],"training":[100],"framework":[101],"jointly":[103],"learns":[104],"the":[105,152],"encoder":[107],"alongside":[111],"DiT-based":[113],"video":[114,120],"generative":[115],"model,":[116],"trained":[117],"large-scale":[119],"datasets":[121],"spanning":[122],"diverse":[123],"motions.":[125],"Motion-identity":[126],"disentanglement":[127],"is":[128],"enforced":[129],"via":[130],"2D":[131],"color":[134],"augmentations,":[135],"as":[136,138],"well":[137],"synthetic":[139],"3D":[140],"renderings":[141],"subject":[144],"pairs":[145],"under":[146],"shared":[147],"poses.":[148],"further":[150],"guide":[151],"learning":[153],"tokens":[156],"using":[157],"auxiliary":[158],"decoders":[159],"to":[160,190],"promote":[161],"fine-grained,":[162],"semantically":[163],"aligned,":[164],"normal-aware":[166],"embeddings.":[168],"Extensive":[169],"experiments":[170],"demonstrate":[171],"X-UniMotion":[173],"outperforms":[174],"state-of-the-art":[175],"methods,":[176],"producing":[177],"animations":[180],"superior":[182],"expressiveness":[184],"identity":[186],"preservation.":[187],"Please":[188],"refer":[189],"https://byteaigc.github.io/X-Unimotion/":[191],"more":[193],"information.":[194]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-12-08T00:00:00"}
