{"id":"https://openalex.org/W7151375053","doi":"https://doi.org/10.48550/arxiv.2604.04055","title":"DINO-VO: Learning Where to Focus for Enhanced State Estimation","display_name":"DINO-VO: Learning Where to Focus for Enhanced State Estimation","publication_year":2026,"publication_date":"2026-04-05","ids":{"openalex":"https://openalex.org/W7151375053","doi":"https://doi.org/10.48550/arxiv.2604.04055"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04055","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04055","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04055","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133118383","display_name":"Qi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133110776","display_name":"Guanghao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guanghao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021356046","display_name":"Sijia Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Sijia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133067789","display_name":"Xin Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133124428","display_name":"Junpeng Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Junpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133137398","display_name":"Xiangyang Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Xiangyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133136205","display_name":"Jian Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pu, Jian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5133118383"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.7272999882698059,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.7272999882698059,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.1039000004529953,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.03269999846816063,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-odometry","display_name":"Visual odometry","score":0.7056999802589417},{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.6588000059127808},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.593500018119812},{"id":"https://openalex.org/keywords/odometry","display_name":"Odometry","score":0.5669000148773193},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5443999767303467},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5202000141143799},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.501800000667572},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.44749999046325684},{"id":"https://openalex.org/keywords/differentiable-function","display_name":"Differentiable function","score":0.44600000977516174},{"id":"https://openalex.org/keywords/occam","display_name":"occam","score":0.4302000105381012}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7361999750137329},{"id":"https://openalex.org/C5799516","wikidata":"https://www.wikidata.org/wiki/Q4110915","display_name":"Visual odometry","level":3,"score":0.7056999802589417},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6657000184059143},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.6588000059127808},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6456999778747559},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.593500018119812},{"id":"https://openalex.org/C49441653","wikidata":"https://www.wikidata.org/wiki/Q2014717","display_name":"Odometry","level":4,"score":0.5669000148773193},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5443999767303467},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5202000141143799},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.501800000667572},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44749999046325684},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.44600000977516174},{"id":"https://openalex.org/C78469957","wikidata":"https://www.wikidata.org/wiki/Q838062","display_name":"occam","level":2,"score":0.4302000105381012},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4092000126838684},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.4077000021934509},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.3734999895095825},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.36880001425743103},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C2983787585","wikidata":"https://www.wikidata.org/wiki/Q93586","display_name":"Feature matching","level":3,"score":0.3587000072002411},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.35420000553131104},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3328000009059906},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.33149999380111694},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C179458375","wikidata":"https://www.wikidata.org/wiki/Q1020763","display_name":"Bundle adjustment","level":3,"score":0.2978000044822693},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.27709999680519104},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27649998664855957},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C87833898","wikidata":"https://www.wikidata.org/wiki/Q1060280","display_name":"Advanced driver assistance systems","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2651999890804291},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.25999999046325684},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C158829959","wikidata":"https://www.wikidata.org/wiki/Q1640606","display_name":"Monocular vision","level":2,"score":0.25450000166893005}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04055","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04055","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04055","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04055","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","score":0.45364877581596375,"display_name":"Sustainable cities and communities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"DINO":[2],"Patch":[3],"Visual":[4,18],"Odometry":[5,19],"(DINO-VO),":[6],"an":[7],"end-to-end":[8,53],"monocular":[9],"visual":[10],"odometry":[11],"system":[12,69,90],"with":[13,76],"strong":[14,125],"scene":[15],"generalization.":[16],"Current":[17],"(VO)":[20],"systems":[21],"often":[22],"rely":[23],"on":[24,113],"heuristic":[25],"feature":[26,73,106],"extraction":[27,74],"strategies,":[28],"which":[29],"can":[30],"degrade":[31],"accuracy":[32],"and":[33,61,93,96,108,118,130],"robustness,":[34],"particularly":[35],"in":[36],"large-scale":[37],"outdoor":[38,131],"environments.":[39],"DINO-VO":[40,123],"addresses":[41],"these":[42],"limitations":[43],"by":[44],"incorporating":[45],"a":[46,71,77],"differentiable":[47,78],"adaptive":[48],"patch":[49],"selector":[50],"into":[51],"the":[52,56,89,103,114],"pipeline,":[54],"improving":[55],"quality":[57],"of":[58],"extracted":[59],"patches":[60],"enhancing":[62],"generalization":[63,126],"across":[64,127],"diverse":[65],"datasets.":[66],"Additionally,":[67],"our":[68],"integrates":[70],"multi-task":[72],"module":[75,82],"bundle":[79],"adjustment":[80],"(BA)":[81],"that":[83,122],"leverages":[84],"inverse":[85],"depth":[86],"priors,":[87],"enabling":[88],"to":[91],"learn":[92],"utilize":[94],"appearance":[95],"geometric":[97],"information":[98],"effectively.":[99],"This":[100],"integration":[101],"bridges":[102],"gap":[104],"between":[105],"learning":[107],"state":[109],"estimation.":[110],"Extensive":[111],"experiments":[112],"TartanAir,":[115],"KITTI,":[116],"Euroc,":[117],"TUM":[119],"datasets":[120],"demonstrate":[121],"exhibits":[124],"synthetic,":[128],"indoor,":[129],"environments,":[132],"achieving":[133],"state-of-the-art":[134],"tracking":[135],"accuracy.":[136]},"counts_by_year":[],"updated_date":"2026-04-08T06:07:18.267832","created_date":"2026-04-08T00:00:00"}
