{"id":"https://openalex.org/W4414856443","doi":"https://doi.org/10.1109/tmm.2025.3618540","title":"Tuning-Free High-Resolution Video Diffusion With Spatial-Temporal Latent Grouping","display_name":"Tuning-Free High-Resolution Video Diffusion With Spatial-Temporal Latent Grouping","publication_year":2025,"publication_date":"2025-10-06","ids":{"openalex":"https://openalex.org/W4414856443","doi":"https://doi.org/10.1109/tmm.2025.3618540"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3618540","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3618540","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5115053622","display_name":"Zhikai Chen","orcid":"https://orcid.org/0009-0000-5398-2026"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhikai Chen","raw_affiliation_strings":["MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036041413","display_name":"Fuchen Long","orcid":"https://orcid.org/0000-0003-0818-0985"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fuchen Long","raw_affiliation_strings":["HiDream.ai Inc., Beijing, China","HiDream.ai Inc., China"],"affiliations":[{"raw_affiliation_string":"HiDream.ai Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"HiDream.ai Inc., China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024833177","display_name":"Zhaofan Qiu","orcid":"https://orcid.org/0000-0002-7485-9198"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaofan Qiu","raw_affiliation_strings":["HiDream.ai Inc., Beijing, China","HiDream.ai Inc., China"],"affiliations":[{"raw_affiliation_string":"HiDream.ai Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"HiDream.ai Inc., China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088760097","display_name":"Ting Yao","orcid":"https://orcid.org/0000-0001-7587-101X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ting Yao","raw_affiliation_strings":["HiDream.ai Inc., Beijing, China","HiDream.ai Inc., China"],"affiliations":[{"raw_affiliation_string":"HiDream.ai Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"HiDream.ai Inc., China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101717914","display_name":"Wengang Zhou","orcid":"https://orcid.org/0000-0003-4776-3964"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wengang Zhou","raw_affiliation_strings":["MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055469774","display_name":"Jiebo Luo","orcid":"https://orcid.org/0000-0002-4516-9729"},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiebo Luo","raw_affiliation_strings":["University of Rochester, Rochester, NY, USA","University of Rochester, New York, USA"],"affiliations":[{"raw_affiliation_string":"University of Rochester, Rochester, NY, USA","institution_ids":["https://openalex.org/I5388228"]},{"raw_affiliation_string":"University of Rochester, New York, USA","institution_ids":["https://openalex.org/I5388228"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017597537","display_name":"Tao Mei","orcid":"https://orcid.org/0000-0003-2497-7732"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao Mei","raw_affiliation_strings":["HiDream.ai Inc., Beijing, China","HiDream.ai Inc., China"],"affiliations":[{"raw_affiliation_string":"HiDream.ai Inc., Beijing, China","institution_ids":[]},{"raw_affiliation_string":"HiDream.ai Inc., China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5115053622"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.27227268,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"42","last_page":"56"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11105","display_name":"Advanced Image Processing Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11105","display_name":"Advanced Image Processing Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10688","display_name":"Image and Signal Denoising Methods","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5135999917984009},{"id":"https://openalex.org/keywords/video-denoising","display_name":"Video denoising","score":0.5098000168800354},{"id":"https://openalex.org/keywords/video-compression-picture-types","display_name":"Video compression picture types","score":0.42570000886917114},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4239000082015991},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.40610000491142273},{"id":"https://openalex.org/keywords/motion-compensation","display_name":"Motion compensation","score":0.3846000134944916},{"id":"https://openalex.org/keywords/reference-frame","display_name":"Reference frame","score":0.38440001010894775},{"id":"https://openalex.org/keywords/video-processing","display_name":"Video processing","score":0.35749998688697815}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8255000114440918},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6215000152587891},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.521399974822998},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5135999917984009},{"id":"https://openalex.org/C30814859","wikidata":"https://www.wikidata.org/wiki/Q4119603","display_name":"Video denoising","level":5,"score":0.5098000168800354},{"id":"https://openalex.org/C106030495","wikidata":"https://www.wikidata.org/wiki/Q1797012","display_name":"Video compression picture types","level":4,"score":0.42570000886917114},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4239000082015991},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C128840427","wikidata":"https://www.wikidata.org/wiki/Q1302174","display_name":"Motion compensation","level":2,"score":0.3846000134944916},{"id":"https://openalex.org/C172849965","wikidata":"https://www.wikidata.org/wiki/Q3148875","display_name":"Reference frame","level":3,"score":0.38440001010894775},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.35749998688697815},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.32190001010894775},{"id":"https://openalex.org/C117090137","wikidata":"https://www.wikidata.org/wiki/Q7927977","display_name":"Video post-processing","level":5,"score":0.3208000063896179},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.31299999356269836},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.3122999966144562},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C167510206","wikidata":"https://www.wikidata.org/wiki/Q2835824","display_name":"Block-matching algorithm","level":4,"score":0.28040000796318054},{"id":"https://openalex.org/C3261483","wikidata":"https://www.wikidata.org/wiki/Q119565","display_name":"Frame rate","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.25619998574256897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3618540","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3618540","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W2034869517","https://openalex.org/W2952435096","https://openalex.org/W3109908659","https://openalex.org/W3176148916","https://openalex.org/W3206627384","https://openalex.org/W3212516020","https://openalex.org/W4214745154","https://openalex.org/W4282964635","https://openalex.org/W4283712710","https://openalex.org/W4312445951","https://openalex.org/W4312800447","https://openalex.org/W4312933730","https://openalex.org/W4312933868","https://openalex.org/W4362468962","https://openalex.org/W4380303706","https://openalex.org/W4382462760","https://openalex.org/W4386065764","https://openalex.org/W4386071957","https://openalex.org/W4386075767","https://openalex.org/W4386083014","https://openalex.org/W4388191297","https://openalex.org/W4388756695","https://openalex.org/W4390872556","https://openalex.org/W4390872681","https://openalex.org/W4390873054","https://openalex.org/W4390873135","https://openalex.org/W4390874113","https://openalex.org/W4391547560","https://openalex.org/W4391547658","https://openalex.org/W4393153503","https://openalex.org/W4393207111","https://openalex.org/W4402667895","https://openalex.org/W4402698357","https://openalex.org/W4402702958","https://openalex.org/W4402727496","https://openalex.org/W4402733581","https://openalex.org/W4402775841","https://openalex.org/W4403277810","https://openalex.org/W4403841908","https://openalex.org/W4403844572","https://openalex.org/W4404690153","https://openalex.org/W4404965692","https://openalex.org/W4405361369","https://openalex.org/W4407881938"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,152,192],"text-to-video":[3],"generation":[4],"have":[5],"demonstrated":[6],"the":[7,28,58,77,95,123,137,156,175,181,189,193,233,237],"substantial":[8],"superiority":[9],"of":[10,61,80,125,160,183,205,214,239],"diffusion":[11,34,45,66,140],"models.":[12],"Nevertheless,":[13],"generating":[14],"high-resolution":[15],"videos":[16],"based":[17],"on":[18,174,232],"text":[19,147,161],"description":[20],"still":[21],"faces":[22],"a":[23,42,62,84],"great":[24],"challenge":[25],"due":[26],"to":[27,87,154,166,179],"enormous":[29],"computation":[30],"overhead":[31],"for":[32,52,68,142,225],"video":[33,44,54,65,70,81,97,139,215,227],"model":[35,67,141],"training.":[36],"In":[37,208],"this":[38],"paper,":[39],"we":[40],"present":[41],"tuning-free":[43],"approach":[46,241],"with":[47,188],"Spatial-Temporal":[48],"LAtent":[49],"Grouping":[50,107,112],"(ST-LAG),":[51],"highresolution":[53,226],"generation.":[55,171,228],"ST-LAG":[56,93],"exploits":[57],"prior":[59],"knowledge":[60],"pre-trained":[63],"low-resolution":[64,138],"regionwise":[69],"latent":[71,82,124,144,177,216],"denoising,":[72],"and":[73,109,116,132,163,219,243],"then":[74,133],"combines":[75],"all":[76,212],"denoised":[78,185],"regions":[79,165],"as":[83],"whole":[85,96],"one":[86],"achieve":[88],"global-wise":[89],"spatial-temporal":[90],"coherence.":[91],"Specifically,":[92],"denoises":[94],"latents":[98],"via":[99],"two":[100],"deliberately":[101],"designed":[102],"modules,":[103],"e.g.,":[104],"Spatial":[105],"Latent":[106,111],"(SLG)":[108],"Temporal":[110],"(TLG),":[113],"at":[114,217],"spatial":[115,164,218],"temporal":[117,199,220],"level,":[118],"respectively.":[119],"SLG":[120,153],"spatially":[121],"slices":[122],"each":[126,184,209],"frame":[127,190],"into":[128,136],"different":[129],"local":[130,186],"patches,":[131],"feeds":[134],"them":[135],"local-region":[143],"denoising.":[145],"A":[146],"re-weighting":[148],"scheme":[149],"is":[150],"devised":[151],"strength":[155],"cross-attention":[157],"between":[158],"features":[159],"tokens":[162],"facilitate":[167],"spatial-level":[168],"finegrained":[169],"details":[170],"TLG":[172],"capitalizes":[173],"segmentlevel":[176],"grouping":[178],"match":[180],"length":[182],"segment":[187],"number":[191],"training":[194],"stage.":[195],"The":[196],"well":[197],"aligned":[198],"receptive":[200],"field":[201],"facilitates":[202],"better":[203],"preservation":[204],"motion":[206],"patterns.":[207],"denoising":[210],"step,":[211],"groups":[213],"levels":[221],"are":[222],"fused":[223],"together":[224],"Extensive":[229],"experiments":[230],"conducted":[231],"ECTV-Prompt":[234],"dataset":[235],"demonstrate":[236],"effectiveness":[238],"our":[240],"quantitatively":[242],"qualitatively.":[244]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
