{"id":"https://openalex.org/W4417183121","doi":"https://doi.org/10.1109/iccv51701.2025.00514","title":"DepthSync: Diffusion Guidance-Based Depth Synchronization for Scale- and Geometry-Consistent Video Depth Estimation","display_name":"DepthSync: Diffusion Guidance-Based Depth Synchronization for Scale- and Geometry-Consistent Video Depth Estimation","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4417183121","doi":"https://doi.org/10.1109/iccv51701.2025.00514"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.00514","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.00514","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2507.01603","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053578088","display_name":"Yue-Jiang Dong","orcid":"https://orcid.org/0009-0007-3096-9823"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yue-Jiang Dong","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100434511","display_name":"Zhao Wang","orcid":"https://orcid.org/0000-0001-5979-4026"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wang Zhao","raw_affiliation_strings":["Tencent PCG,ARC Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent PCG,ARC Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013851053","display_name":"Jingxiang Xu","orcid":"https://orcid.org/0000-0002-1484-9692"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiale Xu","raw_affiliation_strings":["Tencent PCG,ARC Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent PCG,ARC Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102004349","display_name":"Ying Shan","orcid":"https://orcid.org/0000-0001-7673-8325"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Shan","raw_affiliation_strings":["Tencent PCG,ARC Lab"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tencent PCG,ARC Lab","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049883689","display_name":"Song\u2013Hai Zhang","orcid":"https://orcid.org/0000-0003-0460-1586"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Song-Hai Zhang","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35194345,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"5415","last_page":"5425"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.8335000276565552,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.8335000276565552,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10741","display_name":"Video Coding and Compression Technologies","score":0.0812000036239624,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.021299999207258224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5875999927520752},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5737000107765198},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5541999936103821},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5498999953269958},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.49219998717308044},{"id":"https://openalex.org/keywords/diffusion-process","display_name":"Diffusion process","score":0.3431999981403351},{"id":"https://openalex.org/keywords/measure","display_name":"Measure (data warehouse)","score":0.3192000091075897},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.30720001459121704}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6413999795913696},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5875999927520752},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5737000107765198},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5541999936103821},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5498999953269958},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5335000157356262},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.49219998717308044},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42089998722076416},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.39980000257492065},{"id":"https://openalex.org/C68710425","wikidata":"https://www.wikidata.org/wiki/Q5275442","display_name":"Diffusion process","level":3,"score":0.3431999981403351},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.3192000091075897},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.30720001459121704},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.30309998989105225},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C167928553","wikidata":"https://www.wikidata.org/wiki/Q1376021","display_name":"Estimation theory","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C141268832","wikidata":"https://www.wikidata.org/wiki/Q2940499","display_name":"Depth map","level":3,"score":0.28049999475479126},{"id":"https://openalex.org/C113346285","wikidata":"https://www.wikidata.org/wiki/Q6804193","display_name":"Measured depth","level":2,"score":0.2761000096797943},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.265500009059906},{"id":"https://openalex.org/C84824328","wikidata":"https://www.wikidata.org/wiki/Q4633097","display_name":"2D to 3D conversion","level":3,"score":0.2524999976158142}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.00514","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.00514","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2507.01603","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.01603","pdf_url":"https://arxiv.org/pdf/2507.01603","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2507.01603","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.01603","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2507.01603","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.01603","pdf_url":"https://arxiv.org/pdf/2507.01603","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Diffusion-based":[0],"video":[1,61,120],"depth":[2,15,87,100,133,147],"estimation":[3],"methods":[4,22,47],"have":[5],"achieved":[6],"remarkable":[7],"success":[8],"with":[9,149],"strong":[10],"generalization":[11],"ability.":[12],"However,":[13],"predicting":[14],"for":[16,89,156],"long":[17,90,157],"videos":[18,25],"remains":[19],"challenging.":[20],"Existing":[21],"typically":[23],"split":[24],"into":[26],"overlapping":[27],"sliding":[28],"windows,":[29,37],"leading":[30],"to":[31,82,97,107],"accumulated":[32],"scale":[33,95,101,151],"discrepancies":[34],"across":[35,102],"different":[36],"particularly":[38,155],"as":[39],"the":[40,55,99,115,128,140],"number":[41],"of":[42,60,142],"windows":[43,103,112],"increases.":[44],"Additionally,":[45],"these":[46],"rely":[48],"solely":[49],"on":[50,114,136],"2D":[51],"diffusion":[52,80],"priors,":[53],"overlooking":[54],"inherent":[56,116],"3D":[57,117],"geometric":[58,109],"structure":[59],"depths,":[62],"which":[63],"results":[64],"in":[65,119,145],"geometrically":[66],"inconsistent":[67],"predictions.":[68,134],"In":[69],"this":[70],"paper,":[71],"we":[72,93],"propose":[73],"DepthSync,":[74],"a":[75],"novel,":[76],"training-free":[77],"framework":[78],"using":[79],"guidance":[81,96,106],"achieve":[83],"scale-":[84],"and":[85,104,152],"geometry-consistent":[86],"predictions":[88],"videos.":[91,158],"Specifically,":[92],"introduce":[94],"synchronize":[98],"geometry":[105,153],"enforce":[108],"alignment":[110],"within":[111],"based":[113],"constraints":[118],"depths.":[121],"These":[122],"two":[123],"terms":[124],"work":[125],"synergistically,":[126],"steering":[127],"denoising":[129],"process":[130],"toward":[131],"consistent":[132],"Experiments":[135],"various":[137],"datasets":[138],"validate":[139],"effectiveness":[141],"our":[143],"method":[144],"producing":[146],"estimates":[148],"improved":[150],"consistency,":[154]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
