{"id":"https://openalex.org/W4386230985","doi":"https://doi.org/10.1109/tmm.2023.3309559","title":"ViTA: Video Transformer Adaptor for Robust Video Depth Estimation","display_name":"ViTA: Video Transformer Adaptor for Robust Video Depth Estimation","publication_year":2023,"publication_date":"2023-08-28","ids":{"openalex":"https://openalex.org/W4386230985","doi":"https://doi.org/10.1109/tmm.2023.3309559"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2023.3309559","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2023.3309559","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041554011","display_name":"Ke Xian","orcid":"https://orcid.org/0000-0002-0884-5126"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Ke Xian","raw_affiliation_strings":["S-Lab, Nanyang Technological University (NTU), Singapore"],"affiliations":[{"raw_affiliation_string":"S-Lab, Nanyang Technological University (NTU), Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019166957","display_name":"Juewen Peng","orcid":"https://orcid.org/0000-0001-5740-2682"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Juewen Peng","raw_affiliation_strings":["Key Laboratory of Image Processing and Intelligent Control, Ministry of Education, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Image Processing and Intelligent Control, Ministry of Education, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036358447","display_name":"Zhiguo Cao","orcid":"https://orcid.org/0000-0002-9223-1863"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiguo Cao","raw_affiliation_strings":["Key Laboratory of Image Processing and Intelligent Control, Ministry of Education, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Image Processing and Intelligent Control, Ministry of Education, School of Artificial Intelligence and Automation, Huazhong University of Science and Technology, Wuhan, China","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036510129","display_name":"Jianming Zhang","orcid":"https://orcid.org/0000-0002-9954-6294"},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jianming Zhang","raw_affiliation_strings":["Adobe Research, San Francisco, CA, USA"],"affiliations":[{"raw_affiliation_string":"Adobe Research, San Francisco, CA, USA","institution_ids":["https://openalex.org/I1306409833"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029912845","display_name":"Guosheng Lin","orcid":"https://orcid.org/0000-0002-0329-7458"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Guosheng Lin","raw_affiliation_strings":["S-Lab, Nanyang Technological University (NTU), Singapore"],"affiliations":[{"raw_affiliation_string":"S-Lab, Nanyang Technological University (NTU), Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5041554011"],"corresponding_institution_ids":["https://openalex.org/I172675005"],"apc_list":null,"apc_paid":null,"fwci":0.7367,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.72978768,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"26","issue":null,"first_page":"3302","last_page":"3316"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11105","display_name":"Advanced Image Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13114","display_name":"Image Processing Techniques and Applications","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8053449988365173},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6264915466308594},{"id":"https://openalex.org/keywords/optical-flow","display_name":"Optical flow","score":0.6091320514678955},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5965594053268433},{"id":"https://openalex.org/keywords/image-warping","display_name":"Image warping","score":0.5722634196281433},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5382189154624939},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5089550018310547},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.4915355145931244},{"id":"https://openalex.org/keywords/ground-truth","display_name":"Ground truth","score":0.4761338531970978},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.46630680561065674},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4232648015022278},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.14282023906707764}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8053449988365173},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6264915466308594},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.6091320514678955},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5965594053268433},{"id":"https://openalex.org/C157202957","wikidata":"https://www.wikidata.org/wiki/Q1659609","display_name":"Image warping","level":2,"score":0.5722634196281433},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5382189154624939},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5089550018310547},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.4915355145931244},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.4761338531970978},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.46630680561065674},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4232648015022278},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.14282023906707764},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2023.3309559","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2023.3309559","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2613911948","display_name":null,"funder_award_id":"U1913602","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320320751","display_name":"Ministry of Education - Singapore","ror":"https://ror.org/01kcva023"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":68,"referenced_works":["https://openalex.org/W125693051","https://openalex.org/W1513100184","https://openalex.org/W1970862222","https://openalex.org/W2021851106","https://openalex.org/W2074254947","https://openalex.org/W2111212222","https://openalex.org/W2156380531","https://openalex.org/W2300779272","https://openalex.org/W2471962767","https://openalex.org/W2476548250","https://openalex.org/W2520707372","https://openalex.org/W2560474170","https://openalex.org/W2561074213","https://openalex.org/W2609883120","https://openalex.org/W2751567033","https://openalex.org/W2798373498","https://openalex.org/W2941243189","https://openalex.org/W2942368658","https://openalex.org/W2955639361","https://openalex.org/W2956939277","https://openalex.org/W2962456457","https://openalex.org/W2962741876","https://openalex.org/W2963264757","https://openalex.org/W2963316641","https://openalex.org/W2963732628","https://openalex.org/W2974576232","https://openalex.org/W2982014906","https://openalex.org/W2982336692","https://openalex.org/W3034267259","https://openalex.org/W3035563424","https://openalex.org/W3048510980","https://openalex.org/W3081167590","https://openalex.org/W3094502228","https://openalex.org/W3096609285","https://openalex.org/W3109908659","https://openalex.org/W3126721948","https://openalex.org/W3132270109","https://openalex.org/W3138516171","https://openalex.org/W3173409262","https://openalex.org/W3174211490","https://openalex.org/W3174458495","https://openalex.org/W3174541782","https://openalex.org/W3174752334","https://openalex.org/W3177390622","https://openalex.org/W3182318349","https://openalex.org/W3188511781","https://openalex.org/W3205244824","https://openalex.org/W3208147236","https://openalex.org/W3215023725","https://openalex.org/W4200495456","https://openalex.org/W4214516465","https://openalex.org/W4214520160","https://openalex.org/W4250334060","https://openalex.org/W4289550868","https://openalex.org/W4312560592","https://openalex.org/W4312769131","https://openalex.org/W4313177683","https://openalex.org/W4386065828","https://openalex.org/W4386071468","https://openalex.org/W4386075825","https://openalex.org/W4386076206","https://openalex.org/W4386076394","https://openalex.org/W6680369585","https://openalex.org/W6685261749","https://openalex.org/W6703405610","https://openalex.org/W6756774061","https://openalex.org/W6771646754","https://openalex.org/W6788135285"],"related_works":["https://openalex.org/W1670332068","https://openalex.org/W2095618524","https://openalex.org/W2735770592","https://openalex.org/W1971024059","https://openalex.org/W1502062143","https://openalex.org/W4224236531","https://openalex.org/W4291993329","https://openalex.org/W2012410061","https://openalex.org/W2053610073","https://openalex.org/W2970427506"],"abstract_inverted_index":{"Depth":[0],"information":[1],"plays":[2],"a":[3,88,105,136,182,225],"pivotal":[4],"role":[5],"in":[6,25,98,116,156,170,220],"numerous":[7],"computer":[8],"vision":[9],"applications,":[10,27],"including":[11],"autonomous":[12],"driving,":[13],"3D":[14,17],"reconstruction,":[15],"and":[16,111,152,190,211],"content":[18],"generation.":[19],"When":[20],"deploying":[21],"depth":[22,42,50,69,97,215],"estimation":[23,43,70],"models":[24,35],"practical":[26],"it":[28,206],"is":[29,249],"essential":[30],"to":[31,53,61,92,125,207,229,251],"ensure":[32],"that":[33,245],"the":[34,54,65,82,99,117,146,150,153,159,163,167,175,188,231,234,240,246],"have":[36],"strong":[37],"generalization":[38],"capabilities.":[39],"However,":[40],"existing":[41],"methods":[44,71],"primarily":[45],"concentrate":[46],"on":[47,187,239],"robust":[48],"single-image":[49],"estimation,":[51,203],"leading":[52],"occurrence":[55],"of":[56,166,233],"flickering":[57],"artifacts":[58],"when":[59],"applied":[60],"video":[62,68,89,96,214],"inputs.":[63],"On":[64],"other":[66],"hand,":[67],"either":[72],"consume":[73],"excessive":[74],"computational":[75],"resources":[76],"or":[77],"lack":[78],"robustness.":[79],"To":[80,173],"address":[81],"above":[83],"issues,":[84],"we":[85,103,134,180],"propose":[86],"ViTA,":[87],"transformer":[90,108,118],"adaptor,":[91],"estimate":[93,208],"temporally":[94,212],"consistent":[95,213],"wild.":[100],"In":[101],"particular,":[102],"leverage":[104],"pre-trained":[106],"image":[107],"(i.e.,":[109],"DPT)":[110],"introduce":[112],"additional":[113],"temporal":[114,160],"embeddings":[115],"blocks.":[119],"Such":[120],"designs":[121],"enable":[122],"our":[123,196],"ViTA":[124,197],"output":[126],"reliable":[127],"results":[128],"given":[129],"an":[130],"unconstrained":[131],"video.":[132],"Besides,":[133],"present":[135],"spatio-temporal":[137],"consistency":[138],"loss":[139,144,161],"for":[140],"supervision.":[141],"The":[142],"spatial":[143],"computes":[145],"per-pixel":[147],"discrepancy":[148],"between":[149,177],"prediction":[151],"ground":[154],"truth":[155],"space,":[157],"while":[158],"regularizes":[162],"inconsistent":[164],"outputs":[165],"same":[168],"point":[169],"consecutive":[171,178],"frames.":[172],"find":[174],"correspondences":[176],"frames,":[179],"design":[181],"bi-directional":[183],"warping":[184],"strategy":[185],"based":[186],"forward":[189],"backward":[191],"optical":[192,201],"flow.":[193],"During":[194],"inference,":[195],"no":[198],"longer":[199],"requires":[200],"flow":[202],"which":[204],"enables":[205],"spatially":[209],"accurate":[210],"maps":[216],"with":[217],"fine-grained":[218],"details":[219],"real":[221],"time.":[222],"We":[223],"conduct":[224],"detailed":[226],"ablation":[227],"study":[228],"verify":[230],"effectiveness":[232],"proposed":[235,247],"components.":[236],"Extensive":[237],"experiments":[238],"zero-shot":[241],"cross-dataset":[242],"evaluation":[243],"demonstrate":[244],"method":[248],"superior":[250],"previous":[252],"methods.":[253]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-09T08:58:05.943551","created_date":"2025-10-10T00:00:00"}
