{"id":"https://openalex.org/W7154038423","doi":"https://doi.org/10.48550/arxiv.2604.09057","title":"Tora3: Trajectory-Guided Audio-Video Generation with Physical Coherence","display_name":"Tora3: Trajectory-Guided Audio-Video Generation with Physical Coherence","publication_year":2026,"publication_date":"2026-04-10","ids":{"openalex":"https://openalex.org/W7154038423","doi":"https://doi.org/10.48550/arxiv.2604.09057"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.09057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.09057","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133533583","display_name":"Junchao Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liao, Junchao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133519956","display_name":"Zhenghao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhenghao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133501398","display_name":"Xiangyu Meng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meng, Xiangyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133494160","display_name":"Litao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Litao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101601745","display_name":"Ziying Zhang","orcid":"https://orcid.org/0000-0002-1521-8470"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ziying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133513350","display_name":"Siyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Siyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133535656","display_name":"Long Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Long","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5029210936","display_name":"W. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Weizhi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5133533583"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.298799991607666,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.298799991607666,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.20280000567436218,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.2012999951839447,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kinematics","display_name":"Kinematics","score":0.7124999761581421},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.6643000245094299},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5144000053405762},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.47530001401901245},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.4717000126838684},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.4602000117301941},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.41290000081062317},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4041999876499176}],"concepts":[{"id":"https://openalex.org/C39920418","wikidata":"https://www.wikidata.org/wiki/Q11476","display_name":"Kinematics","level":2,"score":0.7124999761581421},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.6643000245094299},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6480000019073486},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5929999947547913},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5430999994277954},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5144000053405762},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.47530001401901245},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.4717000126838684},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.4602000117301941},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.41290000081062317},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4041999876499176},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3643999993801117},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.34880000352859497},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.3384999930858612},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.3312999904155731},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.3215999901294708},{"id":"https://openalex.org/C145565327","wikidata":"https://www.wikidata.org/wiki/Q852514","display_name":"Motion control","level":3,"score":0.3089999854564667},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C95020103","wikidata":"https://www.wikidata.org/wiki/Q1813492","display_name":"Match moving","level":3,"score":0.2727999985218048},{"id":"https://openalex.org/C2985684807","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Text generation","level":2,"score":0.26179999113082886}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.09057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.09057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Audio-video":[0],"(AV)":[1],"generation":[2,66,167],"has":[3],"recently":[4],"made":[5],"strong":[6,170],"progress":[7],"in":[8,130],"perceptual":[9],"quality":[10,168],"and":[11,33,57,98,120,164],"multimodal":[12],"coherence,":[13],"yet":[14],"generating":[15],"content":[16],"with":[17,40,149],"plausible":[18],"motion-sound":[19,162],"relations":[20],"remains":[21],"challenging.":[22],"Existing":[23],"methods":[24],"often":[25],"produce":[26],"object":[27,74],"motions":[28],"that":[29,35,68,126,157],"are":[30,36],"visually":[31],"unstable":[32],"sounds":[34],"only":[37],"loosely":[38],"aligned":[39],"salient":[41],"motion":[42,97,106,152,160],"or":[43],"contact":[44],"events,":[45],"largely":[46],"because":[47],"they":[48],"lack":[49],"an":[50],"explicit":[51],"motion-aware":[52],"structure":[53],"shared":[54,78],"by":[55,72,115],"video":[56],"audio":[58],"generation.":[59],"We":[60,138],"present":[61],"Tora3,":[62],"a":[63,77,86,104,110,121,142],"trajectory-guided":[64],"AV":[65,144,166],"framework":[67],"improves":[69,159],"physical":[70],"coherence":[71,136],"using":[73],"trajectories":[75,84],"as":[76,85],"kinematic":[79,118],"prior.":[80],"Rather":[81],"than":[82],"treating":[83],"video-only":[87],"control":[88],"signal,":[89],"Tora3":[90,158],"uses":[91],"them":[92],"to":[93],"jointly":[94],"guide":[95],"visual":[96],"acoustic":[99],"events.":[100],"Specifically,":[101],"we":[102],"design":[103],"trajectory-aligned":[105],"representation":[107],"for":[108],"video,":[109],"kinematic-audio":[111],"alignment":[112],"module":[113],"driven":[114],"trajectory-derived":[116],"second-order":[117],"states,":[119],"hybrid":[122],"flow":[123],"matching":[124],"scheme":[125],"preserves":[127],"trajectory":[128],"fidelity":[129],"trajectory-conditioned":[131],"regions":[132],"while":[133],"maintaining":[134],"local":[135],"elsewhere.":[137],"further":[139],"curate":[140],"PAV,":[141],"large-scale":[143],"dataset":[145],"emphasizing":[146],"motion-relevant":[147],"patterns":[148],"automatically":[150],"extracted":[151],"annotations.":[153],"Extensive":[154],"experiments":[155],"show":[156],"realism,":[161],"synchronization,":[163],"overall":[165],"over":[169],"open-source":[171],"baselines.":[172]},"counts_by_year":[],"updated_date":"2026-04-14T06:08:25.285971","created_date":"2026-04-14T00:00:00"}
