{"id":"https://openalex.org/W7139922230","doi":"https://doi.org/10.48550/arxiv.2603.18600","title":"Improving Joint Audio-Video Generation with Cross-Modal Context Learning","display_name":"Improving Joint Audio-Video Generation with Cross-Modal Context Learning","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7139922230","doi":"https://doi.org/10.48550/arxiv.2603.18600"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.18600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.18600","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112991062","display_name":"Bingqi Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Bingqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130229331","display_name":"Linlong Lang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lang, Linlong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130254013","display_name":"Ming Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130238271","display_name":"Dailan He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Dailan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062853294","display_name":"Xingtong Ge","orcid":"https://orcid.org/0000-0001-7603-2832"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Xingtong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130249880","display_name":"Yi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130225060","display_name":"Guanglu Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Guanglu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130243801","display_name":"Yu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.45350000262260437,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.45350000262260437,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.1949000060558319,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.1281999945640564,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context-model","display_name":"Context model","score":0.5659000277519226},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4684999883174896},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4239000082015991},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.3752000033855438},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.3671000003814697},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.34290000796318054},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.32089999318122864}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7883999943733215},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.5659000277519226},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.534500002861023},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.49470001459121704},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4684999883174896},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4239000082015991},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.3671000003814697},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.34290000796318054},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.32089999318122864},{"id":"https://openalex.org/C2781368080","wikidata":"https://www.wikidata.org/wiki/Q501688","display_name":"Context awareness","level":3,"score":0.3102000057697296},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.30250000953674316},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.2992999851703644},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2750999927520752},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.18600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.18600","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18600","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,134],"dual-stream":[1,52],"transformer":[2,53],"architecture-based":[3],"joint":[4],"audio-video":[5,36],"generation":[6,173],"method":[7],"has":[8],"become":[9],"the":[10,51,66,82,124,145,168,182],"dominant":[11],"paradigm":[12,54],"in":[13,73,84,144],"current":[14],"research.":[15],"By":[16],"incorporating":[17],"pre-trained":[18],"video":[19,131],"diffusion":[20,24],"models":[21],"and":[22,55,81,91,119,130,139,172,197],"audio":[23,128],"models,":[25],"along":[26],"with":[27,41,111,209],"a":[28],"cross-modal":[29,70,79,156],"interaction":[30],"attention":[31],"module,":[32],"high-quality,":[33],"temporally":[34],"synchronized":[35],"content":[37],"can":[38],"be":[39],"generated":[40],"minimal":[42],"training":[43,90,164],"data.":[44],"In":[45],"this":[46],"paper,":[47],"we":[48,104],"first":[49],"revisit":[50],"further":[56,166,198],"analyze":[57],"its":[58],"limitations,":[59],"including":[60],"model":[61],"manifold":[62],"variations":[63],"caused":[64],"by":[65,78,186],"gating":[67],"mechanism":[68],"controlling":[69],"interactions,":[71],"biases":[72],"multi-modal":[74,85],"background":[75],"regions":[76],"introduced":[77],"attention,":[80],"inconsistencies":[83],"classifier-free":[86],"guidance":[87],"(CFG)":[88],"during":[89],"inference,":[92,176],"as":[93,95],"well":[94],"conflicts":[96],"between":[97,127],"multiple":[98],"conditions.":[99],"To":[100],"alleviate":[101],"these":[102],"issues,":[103],"propose":[105],"Cross-Modal":[106,146],"Context":[107,136,141,147,178],"Learning":[108],"(CCL),":[109],"equipped":[110],"several":[112],"carefully":[113],"designed":[114],"modules.":[115],"Temporally":[116],"Aligned":[117],"RoPE":[118],"Partitioning":[120],"(TARP)":[121],"effectively":[122],"enhances":[123],"temporal":[125],"alignment":[126],"latent":[129,132],"representations.":[133],"Learnable":[135],"Tokens":[137],"(LCT)":[138],"Dynamic":[140],"Routing":[142],"(DCR)":[143],"Attention":[148],"(CCA)":[149],"module":[150],"provide":[151],"stable":[152],"unconditional":[153,183],"anchors":[154],"for":[155],"information,":[157],"while":[158,213],"dynamically":[159],"routing":[160],"based":[161],"on":[162],"different":[163,190],"tasks,":[165],"enhancing":[167],"model's":[169],"convergence":[170],"speed":[171],"quality.":[174],"During":[175],"Unconditional":[177],"Guidance":[179],"(UCG)":[180],"leverages":[181],"support":[184],"provided":[185],"LCT":[187],"to":[188],"facilitate":[189],"forms":[191],"of":[192],"CFG,":[193],"improving":[194],"train-inference":[195],"consistency":[196],"alleviating":[199],"conflicts.":[200],"Through":[201],"comprehensive":[202],"evaluations,":[203],"CCL":[204],"achieves":[205],"state-of-the-art":[206],"performance":[207],"compared":[208],"recent":[210],"academic":[211],"methods":[212],"requiring":[214],"substantially":[215],"fewer":[216],"resources.":[217]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-21T00:00:00"}
