{"id":"https://openalex.org/W4415538104","doi":"https://doi.org/10.1145/3746027.3754869","title":"Tora2: Motion and Appearance Customized Diffusion Transformer for Multi-Entity Video Generation","display_name":"Tora2: Motion and Appearance Customized Diffusion Transformer for Multi-Entity Video Generation","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415538104","doi":"https://doi.org/10.1145/3746027.3754869"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754869","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754869","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3754869","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhenghao Zhang","orcid":"https://orcid.org/0009-0006-7229-1398"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhenghao Zhang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0006-7229-1398","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018182257","display_name":"Junchao Liao","orcid":"https://orcid.org/0000-0003-4282-0843"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junchao Liao","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-4282-0843","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiangyu Meng","orcid":"https://orcid.org/0009-0001-6224-7979"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangyu Meng","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-6224-7979","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Long Qin","orcid":"https://orcid.org/0009-0003-4712-8570"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Qin","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0003-4712-8570","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029210936","display_name":"W. Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weizhi Wang","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0003-7874-9468","affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28716056,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9434","last_page":"9443"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/personalization","display_name":"Personalization","score":0.6851999759674072},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5098000168800354},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4025999903678894},{"id":"https://openalex.org/keywords/motion-control","display_name":"Motion control","score":0.3864000141620636},{"id":"https://openalex.org/keywords/motion-estimation","display_name":"Motion estimation","score":0.33570000529289246},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.3336000144481659}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7595000267028809},{"id":"https://openalex.org/C183003079","wikidata":"https://www.wikidata.org/wiki/Q1000371","display_name":"Personalization","level":2,"score":0.6851999759674072},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5360999703407288},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5098000168800354},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5040000081062317},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4025999903678894},{"id":"https://openalex.org/C145565327","wikidata":"https://www.wikidata.org/wiki/Q852514","display_name":"Motion control","level":3,"score":0.3864000141620636},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.33570000529289246},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.3336000144481659},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.33000001311302185},{"id":"https://openalex.org/C128840427","wikidata":"https://www.wikidata.org/wiki/Q1302174","display_name":"Motion compensation","level":2,"score":0.30799999833106995},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.2614000141620636}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754869","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754869","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746027.3754869","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3754869","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2603777577","https://openalex.org/W2969985801","https://openalex.org/W4312473638","https://openalex.org/W4312547276","https://openalex.org/W4385259380","https://openalex.org/W4386072096","https://openalex.org/W4386076280","https://openalex.org/W4390872297","https://openalex.org/W4402727699","https://openalex.org/W4402753906","https://openalex.org/W4402754134","https://openalex.org/W4402754215","https://openalex.org/W4402961675","https://openalex.org/W4403770406","https://openalex.org/W4404612908","https://openalex.org/W4404900558","https://openalex.org/W4404971889"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,37,93,164],"diffusion":[3],"transformer":[4],"models":[5],"for":[6,55,85,138],"motion-guided":[7],"video":[8,139,166],"generation,":[9],"such":[10],"as":[11],"Tora,":[12,27],"have":[13],"shown":[14],"significant":[15],"progress.":[16],"In":[17],"this":[18],"paper,":[19],"we":[20,44,71,99],"present":[21],"Tora2,":[22],"an":[23],"enhanced":[24],"version":[25],"of":[26,134],"which":[28,159],"introduces":[29],"several":[30],"design":[31,72],"improvements":[32],"to":[33,65,77,122,129],"expand":[34],"its":[35],"capabilities":[36],"both":[38],"appearance":[39,135],"and":[40,82,109,117,136],"motion":[41,116,137,156],"customization.":[42],"Specifically,":[43],"introduce":[45,100],"a":[46,73,101,161],"decoupled":[47],"personalization":[48,53,118],"extractor":[49],"that":[50,104,144],"generates":[51],"comprehensive":[52],"embeddings":[54],"multiple":[56],"open-set":[57],"entities,":[58],"better":[59],"preserving":[60],"fine-grained":[61],"visual":[62,83],"details":[63],"compared":[64],"previous":[66],"methods.":[67],"Building":[68],"on":[69],"this,":[70],"gated":[74],"self-attention":[75],"mechanism":[76],"integrate":[78],"trajectory,":[79],"textual":[80],"description,":[81],"information":[84],"each":[86],"entity.":[87],"This":[88],"innovation":[89],"significantly":[90],"reduces":[91],"misalignment":[92],"multimodal":[94],"conditioning":[95],"during":[96],"training.":[97],"Moreover,":[98],"contrastive":[102],"loss":[103],"jointly":[105],"optimizes":[106],"trajectory":[107],"dynamics":[108],"entity":[110],"consistency":[111],"through":[112],"explicit":[113],"mapping":[114],"between":[115],"embeddings.":[119],"Tora2":[120,145],"is,":[121],"our":[123],"best":[124],"knowledge,":[125],"the":[126],"first":[127],"method":[128],"achieve":[130],"simultaneous":[131],"multi-entity":[132],"customization":[133,151],"generation.":[140,167],"Experimental":[141],"results":[142],"demonstrate":[143],"achieves":[146],"competitive":[147],"performance":[148],"with":[149],"state-of-the-art":[150],"methods":[152],"while":[153],"providing":[154],"advanced":[155],"control":[157],"capabilities,":[158],"marks":[160],"critical":[162],"advancement":[163],"multi-condition":[165]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-25T00:00:00"}
