{"id":"https://openalex.org/W4390722773","doi":"https://doi.org/10.48550/arxiv.2401.03048","title":"Latte: Latent Diffusion Transformer for Video Generation","display_name":"Latte: Latent Diffusion Transformer for Video Generation","publication_year":2024,"publication_date":"2024-01-05","ids":{"openalex":"https://openalex.org/W4390722773","doi":"https://doi.org/10.48550/arxiv.2401.03048"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2401.03048","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.03048","pdf_url":"https://arxiv.org/pdf/2401.03048","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2401.03048","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101098273","display_name":"Xin Ma","orcid":"https://orcid.org/0000-0001-5240-1867"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ma, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100672233","display_name":"Yaohui Wang","orcid":"https://orcid.org/0009-0002-6262-7450"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yaohui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084003247","display_name":"Gengyun Jia","orcid":"https://orcid.org/0000-0002-0513-138X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xinyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100664512","display_name":"Xinyuan Chen","orcid":"https://orcid.org/0000-0002-5517-7255"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Gengyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100406050","display_name":"Ziwei Liu","orcid":"https://orcid.org/0000-0002-4220-5958"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017943466","display_name":"Yuan-Fang Li","orcid":"https://orcid.org/0000-0003-4651-2821"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuan-Fang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072508861","display_name":"Cunjian Chen","orcid":"https://orcid.org/0000-0002-2926-9762"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Cunjian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101371688","display_name":"Yu Qiao","orcid":"https://orcid.org/0000-0001-8932-7621"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Yu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5101098273"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7990126013755798},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.773353099822998},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7345519065856934},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4720669090747833},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3385053873062134},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.33134979009628296},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.0925683081150055}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7990126013755798},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.773353099822998},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7345519065856934},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4720669090747833},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3385053873062134},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.33134979009628296},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0925683081150055},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2401.03048","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.03048","pdf_url":"https://arxiv.org/pdf/2401.03048","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2401.03048","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2401.03048","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2401.03048","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.03048","pdf_url":"https://arxiv.org/pdf/2401.03048","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.41999998688697815,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"We":[0,142],"propose":[1],"Latte,":[2],"a":[3,22,39],"novel":[4],"Latent":[5],"Diffusion":[6],"Transformer":[7,25],"for":[8,150,159],"video":[9,29,84,111,160],"generation.":[10,161],"Latte":[11,78,104,124,132,146],"first":[12],"extracts":[13],"spatio-temporal":[14],"tokens":[15,43],"from":[16,45,52],"input":[17,63],"videos":[18],"and":[19,59,96,118],"then":[20],"adopts":[21],"series":[23],"of":[24,42,55,62,69,77],"blocks":[26],"to":[27,37,125],"model":[28,38,88],"distribution":[30],"in":[31],"the":[32,53,57,67,74,126],"latent":[33],"space.":[34],"In":[35,120],"order":[36],"substantial":[40],"number":[41],"extracted":[44],"videos,":[46,71],"four":[47,109],"efficient":[48],"variants":[49],"are":[50,136],"introduced":[51],"perspective":[54],"decomposing":[56],"spatial":[58],"temporal":[60,93],"dimensions":[61],"videos.":[64],"To":[65],"improve":[66],"quality":[68],"generated":[70],"we":[72,122],"determine":[73],"best":[75],"practices":[76],"through":[79],"rigorous":[80],"experimental":[81],"analysis,":[82],"including":[83],"clip":[85],"patch":[86],"embedding,":[87,95],"variants,":[89],"timestep-class":[90],"information":[91],"injection,":[92],"positional":[94],"learning":[97],"strategies.":[98],"Our":[99],"comprehensive":[100],"evaluation":[101],"demonstrates":[102],"that":[103,135,145],"achieves":[105,133],"state-of-the-art":[106],"performance":[107],"across":[108],"standard":[110],"generation":[112,128],"datasets,":[113],"i.e.,":[114],"FaceForensics,":[115],"SkyTimelapse,":[116],"UCF101,":[117],"Taichi-HD.":[119],"addition,":[121],"extend":[123],"text-to-video":[127],"(T2V)":[129],"task,":[130],"where":[131],"results":[134],"competitive":[137],"with":[138],"recent":[139],"T2V":[140],"models.":[141],"strongly":[143],"believe":[144],"provides":[147],"valuable":[148],"insights":[149],"future":[151],"research":[152],"on":[153],"incorporating":[154],"Transformers":[155],"into":[156],"diffusion":[157],"models":[158]},"counts_by_year":[{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":3}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
