{"id":"https://openalex.org/W7131394878","doi":"https://doi.org/10.48550/arxiv.2602.19163","title":"JavisDiT++: Unified Modeling and Optimization for Joint Audio-Video Generation","display_name":"JavisDiT++: Unified Modeling and Optimization for Joint Audio-Video Generation","publication_year":2026,"publication_date":"2026-02-22","ids":{"openalex":"https://openalex.org/W7131394878","doi":"https://doi.org/10.48550/arxiv.2602.19163"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.19163","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19163","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.19163","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126817590","display_name":"Kai Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126722550","display_name":"Yanhao Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Yanhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126849863","display_name":"Kai Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126730562","display_name":"Shengqiong Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Shengqiong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126704373","display_name":"Rongjunchen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Rongjunchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126403130","display_name":"Jiebo Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Jiebo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hatzinakos, Dimitrios","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hatzinakos, Dimitrios","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126806717","display_name":"Ziwei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019435679","display_name":"Fei Hao","orcid":"https://orcid.org/0000-0003-4942-0893"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fei, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126788620","display_name":"Tat-Seng Chua","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chua, Tat-Seng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5126817590"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.2549000084400177,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.2549000084400177,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.19750000536441803,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.19609999656677246,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.6442999839782715},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.6133000254631042},{"id":"https://openalex.org/keywords/rope","display_name":"Rope","score":0.5778999924659729},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.5342000126838684},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5335999727249146},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.44209998846054077},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.3953999876976013}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8061000108718872},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.6442999839782715},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.6133000254631042},{"id":"https://openalex.org/C162269090","wikidata":"https://www.wikidata.org/wiki/Q1156047","display_name":"Rope","level":2,"score":0.5778999924659729},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.5342000126838684},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5335999727249146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49459999799728394},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.44209998846054077},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3953999876976013},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36820000410079956},{"id":"https://openalex.org/C3018907156","wikidata":"https://www.wikidata.org/wiki/Q7785058","display_name":"Third generation","level":2,"score":0.36550000309944153},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.36149999499320984},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.310699999332428},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.29919999837875366},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C17137986","wikidata":"https://www.wikidata.org/wiki/Q215067","display_name":"Orthogonality","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2615000009536743},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.26080000400543213}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.19163","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19163","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.19163","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19163","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.42621445655822754,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"AIGC":[0],"has":[1,22],"rapidly":[2],"expanded":[3],"from":[4,37,54],"text-to-image":[5],"generation":[6,20,57,102],"toward":[7],"high-quality":[8],"multimodal":[9],"synthesis":[10],"across":[11,139],"video":[12,120],"and":[13,31,35,61,82,119,142,167,188],"audio.":[14],"Within":[15],"this":[16,70],"context,":[17],"joint":[18],"audio-video":[19,126],"(JAVG)":[21],"emerged":[23],"as":[24,47],"a":[25,74,89,107],"fundamental":[26],"task":[27],"that":[28,94],"produces":[29],"synchronized":[30],"semantically":[32],"aligned":[33],"sound":[34],"vision":[36],"textual":[38],"descriptions.":[39],"However,":[40],"compared":[41],"with":[42,63,136,154],"advanced":[43],"commercial":[44],"models":[45],"such":[46],"Veo3,":[48],"existing":[49],"open-source":[50],"methods":[51],"still":[52],"suffer":[53],"limitations":[55],"in":[56,164],"quality,":[58,140],"temporal":[59],"synchrony,":[60],"alignment":[62],"human":[64,137],"preferences.":[65],"To":[66],"bridge":[67],"the":[68,178,185],"gap,":[69],"paper":[71],"presents":[72],"JavisDiT++,":[73],"concise":[75],"yet":[76],"powerful":[77],"framework":[78],"for":[79],"unified":[80],"modeling":[81],"optimization":[83,129],"of":[84,180],"JAVG.":[85],"First,":[86],"we":[87,105,123],"introduce":[88],"modality-specific":[90],"mixture-of-experts":[91],"(MS-MoE)":[92],"design":[93],"enables":[95],"cross-modal":[96],"interaction":[97],"efficacy":[98],"while":[99],"enhancing":[100],"single-modal":[101],"quality.":[103],"Then,":[104],"propose":[106],"temporal-aligned":[108],"RoPE":[109],"(TA-RoPE)":[110],"strategy":[111],"to":[112,132,176],"achieve":[113],"explicit,":[114],"frame-level":[115],"synchronization":[116],"between":[117],"audio":[118],"tokens.":[121],"Besides,":[122],"develop":[124],"an":[125],"direct":[127],"preference":[128,138],"(AV-DPO)":[130],"method":[131],"align":[133],"model":[134,149],"outputs":[135],"consistency,":[141],"synchrony":[143],"dimensions.":[144],"Built":[145],"upon":[146],"Wan2.1-1.3B-T2V,":[147],"our":[148,181],"achieves":[150],"state-of-the-art":[151],"performance":[152],"merely":[153],"around":[155],"1M":[156],"public":[157],"training":[158],"entries,":[159],"significantly":[160],"outperforming":[161],"prior":[162],"approaches":[163],"both":[165],"qualitative":[166],"quantitative":[168],"evaluations.":[169],"Comprehensive":[170],"ablation":[171],"studies":[172],"have":[173],"been":[174],"conducted":[175],"validate":[177],"effectiveness":[179],"proposed":[182],"modules.":[183],"All":[184],"code,":[186],"model,":[187],"dataset":[189],"are":[190],"released":[191],"at":[192],"https://JavisVerse.github.io/JavisDiT2-page.":[193]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2026-02-26T00:00:00"}
