{"id":"https://openalex.org/W7134895952","doi":"https://doi.org/10.48550/arxiv.2603.09721","title":"FrameDiT: Diffusion Transformer with Matrix Attention for Efficient Video Generation","display_name":"FrameDiT: Diffusion Transformer with Matrix Attention for Efficient Video Generation","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134895952","doi":"https://doi.org/10.48550/arxiv.2603.09721"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.09721","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09721","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.09721","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128732302","display_name":"Minh Khoa Le","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Le, Minh Khoa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128800567","display_name":"Kien Do","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Do, Kien","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024634557","display_name":"Duc Thanh Nguyen","orcid":"https://orcid.org/0000-0002-2285-2066"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Duc Thanh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128727249","display_name":"Truyen Tran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tran, Truyen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5128732302"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7633000016212463,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7633000016212463,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.026499999687075615,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.021700000390410423,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/video-quality","display_name":"Video quality","score":0.487199991941452},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.44020000100135803},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.3684999942779541},{"id":"https://openalex.org/keywords/video-denoising","display_name":"Video denoising","score":0.3668000102043152},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.3596999943256378},{"id":"https://openalex.org/keywords/video-processing","display_name":"Video processing","score":0.33480000495910645},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.33309999108314514}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7533000111579895},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.487199991941452},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.45980000495910645},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.44020000100135803},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4374000132083893},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.399399995803833},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C30814859","wikidata":"https://www.wikidata.org/wiki/Q4119603","display_name":"Video denoising","level":5,"score":0.3668000102043152},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.3596999943256378},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.33309999108314514},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.32710000872612},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.3163999915122986},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C106030495","wikidata":"https://www.wikidata.org/wiki/Q1797012","display_name":"Video compression picture types","level":4,"score":0.2831999957561493},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2700999975204468},{"id":"https://openalex.org/C3261483","wikidata":"https://www.wikidata.org/wiki/Q119565","display_name":"Frame rate","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.26570001244544983},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2624000012874603}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.09721","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09721","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.09721","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09721","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"High-fidelity":[0],"video":[1,19,25,151,159],"generation":[2,152],"remains":[3],"challenging":[4],"for":[5],"diffusion":[6,20],"models":[7],"due":[8],"to":[9,110,134,165],"the":[10,47,55],"difficulty":[11],"of":[12,29],"modeling":[13],"complex":[14],"spatio-temporal":[15,30,106],"dynamics":[16],"efficiently.":[17],"Recent":[18],"methods":[21],"typically":[22],"represent":[23],"a":[24,27,44,71,82,116],"as":[26,81],"sequence":[28],"tokens":[31],"which":[32,126],"can":[33],"be":[34],"modeled":[35],"using":[36],"Diffusion":[37],"Transformers":[38],"(DiTs).":[39],"However,":[40],"this":[41,65],"approach":[42],"faces":[43],"trade-off":[45],"between":[46],"strong":[48],"but":[49,57],"expensive":[50],"Full":[51],"3D":[52],"Attention":[53,102,129,133],"and":[54,84,88,108,122,138,158],"efficient":[56],"temporally":[58],"limited":[59],"Local":[60,131,166],"Factorized":[61,132,167],"Attention.":[62,168],"To":[63],"resolve":[64],"trade-off,":[66],"we":[67],"propose":[68],"Matrix":[69,101,128],"Attention,":[70],"frame-level":[72],"temporal":[73,156],"attention":[74],"mechanism":[75],"that":[76,144],"processes":[77],"an":[78],"entire":[79],"frame":[80],"matrix":[83],"generates":[85],"query,":[86],"key,":[87],"value":[89],"matrices":[90],"via":[91],"matrix-native":[92],"operations.":[93],"By":[94],"attending":[95],"across":[96,149],"frames":[97],"rather":[98],"than":[99],"tokens,":[100],"effectively":[103],"preserves":[104],"global":[105],"structure":[107],"adapts":[109],"significant":[111],"motion.":[112,140],"We":[113],"build":[114],"FrameDiT-G,":[115],"DiT":[117],"architecture":[118],"based":[119],"on":[120],"MatrixAttention,":[121],"further":[123],"introduce":[124],"FrameDiT-H,":[125],"integrates":[127],"with":[130],"capture":[135],"both":[136],"large":[137],"small":[139],"Extensive":[141],"experiments":[142],"show":[143],"FrameDiT-H":[145],"achieves":[146],"state-of-the-art":[147],"results":[148],"multiple":[150],"benchmarks,":[153],"offering":[154],"improved":[155],"coherence":[157],"quality":[160],"while":[161],"maintaining":[162],"efficiency":[163],"comparable":[164]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-12T00:00:00"}
