{"id":"https://openalex.org/W4414360106","doi":"https://doi.org/10.24963/ijcai.2025/114","title":"DepthART: Monocular Depth Estimation as Autoregressive Refinement Task","display_name":"DepthART: Monocular Depth Estimation as Autoregressive Refinement Task","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360106","doi":"https://doi.org/10.24963/ijcai.2025/114"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/114","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/114","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086074077","display_name":"B. M. Gabdullin","orcid":null},"institutions":[{"id":"https://openalex.org/I118501908","display_name":"National Research University Higher School of Economics","ror":"https://ror.org/055f7t516","country_code":"RU","type":"education","lineage":["https://openalex.org/I118501908"]}],"countries":["RU"],"is_corresponding":false,"raw_author_name":"Bulat Gabdullin","raw_affiliation_strings":["AIRI, Moscow, Russia","HSE University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AIRI, Moscow, Russia","institution_ids":[]},{"raw_affiliation_string":"HSE University","institution_ids":["https://openalex.org/I118501908"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008981712","display_name":"Nina Konovalova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nina Konovalova","raw_affiliation_strings":["AIRI, Moscow, Russia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AIRI, Moscow, Russia","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055218325","display_name":"Nikolay Patakin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nikolay Patakin","raw_affiliation_strings":["AIRI, Moscow, Russia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AIRI, Moscow, Russia","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050618007","display_name":"Dmitry Senushkin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dmitry Senushkin","raw_affiliation_strings":["AIRI, Moscow, Russia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AIRI, Moscow, Russia","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065787419","display_name":"Anton Konushin","orcid":"https://orcid.org/0000-0002-6152-0021"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anton Konushin","raw_affiliation_strings":["AIRI, Moscow, Russia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AIRI, Moscow, Russia","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32061035,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1017","last_page":"1025"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12111","display_name":"Industrial Vision Systems and Defect Detection","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12111","display_name":"Industrial Vision Systems and Defect Detection","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10638","display_name":"Optical measurement and interference techniques","score":0.9758999943733215,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7889000177383423},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.7595999836921692},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.630299985408783},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.5343000292778015},{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.5178999900817871},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4097000062465668},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.397599995136261},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.37959998846054077}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7889000177383423},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.7595999836921692},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6938999891281128},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.630299985408783},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6039000153541565},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.5343000292778015},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.5178999900817871},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4097000062465668},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.397599995136261},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3961000144481659},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.37959998846054077},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.3732999861240387},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.3366999924182892},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.33640000224113464},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2939000129699707},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2685000002384186},{"id":"https://openalex.org/C160234255","wikidata":"https://www.wikidata.org/wiki/Q812535","display_name":"Bayesian inference","level":3,"score":0.2581999897956848},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/114","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/114","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Monocular":[0],"depth":[1,88,106,197],"estimation":[2,89,198],"has":[3],"seen":[4],"significant":[5],"advances":[6],"through":[7],"discriminative":[8,222],"approaches,":[9,54],"yet":[10],"their":[11],"performance":[12,193],"remains":[13],"constrained":[14],"by":[15,28,100],"the":[16,86,92,151,167,174,186,192,208],"limitations":[17],"of":[18,158,194],"training":[19,95,118,130,177,188],"datasets.":[20],"While":[21],"generative":[22,53,220],"approaches":[23],"have":[24,59],"addressed":[25],"this":[26,77,110],"challenge":[27],"leveraging":[29],"priors":[30],"from":[31],"internet-scale":[32],"datasets,":[33],"with":[34,131],"recent":[35],"studies":[36],"showing":[37],"state-of-the-art":[38],"results":[39,62,104,183,212],"using":[40,205],"fine-tuned":[41],"text-to-image":[42],"diffusion":[43,65],"models,":[44],"there":[45],"is":[46],"still":[47],"room":[48],"for":[49,105],"improvement.":[50],"Notably,":[51],"autoregressive":[52],"particularly":[55],"Visual":[56,81],"AutoRegressive":[57],"modeling,":[58],"demonstrated":[60],"superior":[61,211],"compared":[63,217],"to":[64,85,218],"models":[66],"in":[67,196],"conditioned":[68],"image":[69],"synthesis,":[70],"while":[71],"offering":[72],"faster":[73],"inference":[74,179],"times.":[75],"In":[76],"work,":[78],"we":[79,112,165],"apply":[80],"Autoregressive":[82,124],"Transformer":[83],"(VAR)":[84],"monocular":[87],"problem.":[90],"However,":[91],"conventional":[93],"GPT-2-style":[94],"procedure":[96],"(teacher":[97],"forcing)":[98],"inherited":[99],"VAR":[101,129,195],"yields":[102],"suboptimal":[103],"estimation.":[107],"To":[108],"address":[109],"limitation,":[111],"introduce":[113],"DepthART":[114],"-":[115],"a":[116,122,139],"novel":[117],"method":[119,137],"formulated":[120],"as":[121,155,169],"Depth":[123],"Refinement":[125],"Task.":[126],"Unlike":[127],"traditional":[128],"static":[132],"inputs":[133,156],"and":[134,178,221],"targets,":[135],"our":[136,206],"implements":[138],"dynamic":[140],"target":[141],"formulation":[142],"based":[143],"on":[144,202],"model":[145,209],"outputs,":[146],"enabling":[147],"self-refinement.":[148],"By":[149],"utilizing":[150],"model's":[152],"own":[153],"predictions":[154],"instead":[157],"ground":[159],"truth":[160],"token":[161],"maps":[162],"during":[163],"training,":[164],"frame":[166],"objective":[168],"residual":[170],"minimization,":[171],"effectively":[172],"reducing":[173],"discrepancy":[175],"between":[176],"procedures.":[180],"Our":[181],"experimental":[182],"demonstrate":[184],"that":[185],"proposed":[187],"approach":[189],"significantly":[190],"enhances":[191],"tasks.":[199],"When":[200],"trained":[201],"Hypersim":[203],"dataset":[204],"approach,":[207],"achieves":[210],"across":[213],"multiple":[214],"unseen":[215],"benchmarks":[216],"existing":[219],"baselines.":[223]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
