{"id":"https://openalex.org/W7138888355","doi":"https://doi.org/10.48550/arxiv.2603.18002","title":"Loc3R-VLM: Language-based Localization and 3D Reasoning with Vision-Language Models","display_name":"Loc3R-VLM: Language-based Localization and 3D Reasoning with Vision-Language Models","publication_year":2026,"publication_date":"2026-03-18","ids":{"openalex":"https://openalex.org/W7138888355","doi":"https://doi.org/10.48550/arxiv.2603.18002"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.18002","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18002","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.18002","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129898911","display_name":"Kevin Qu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qu, Kevin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129956290","display_name":"Haozhe Qi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Haozhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030272656","display_name":"Mihai Dusmanu","orcid":"https://orcid.org/0000-0002-3219-1783"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dusmanu, Mihai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130128936","display_name":"Mahdi Rad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rad, Mahdi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129997148","display_name":"Rui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129794584","display_name":"Marc Pollefeys","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pollefeys, Marc","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5129898911"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.002199999988079071,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.00139999995008111,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7139999866485596},{"id":"https://openalex.org/keywords/situated","display_name":"Situated","score":0.6621999740600586},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6162999868392944},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.5174999833106995},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4918000102043152},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4887999892234802},{"id":"https://openalex.org/keywords/a-priori-and-a-posteriori","display_name":"A priori and a posteriori","score":0.3652999997138977},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.34040001034736633},{"id":"https://openalex.org/keywords/3d-model","display_name":"3d model","score":0.33640000224113464}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7221999764442444},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7139999866485596},{"id":"https://openalex.org/C132829578","wikidata":"https://www.wikidata.org/wiki/Q581151","display_name":"Situated","level":2,"score":0.6621999740600586},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.659600019454956},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6162999868392944},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.5174999833106995},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4918000102043152},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4887999892234802},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4122999906539917},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3889999985694885},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.3652999997138977},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.34040001034736633},{"id":"https://openalex.org/C3019007443","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3d model","level":2,"score":0.33640000224113464},{"id":"https://openalex.org/C2777897806","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3D modeling","level":2,"score":0.3330000042915344},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.3305000066757202},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.32670000195503235},{"id":"https://openalex.org/C109950114","wikidata":"https://www.wikidata.org/wiki/Q4464732","display_name":"3D reconstruction","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.32280001044273376},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.3124000132083893},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.29820001125335693},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C104065381","wikidata":"https://www.wikidata.org/wiki/Q1002535","display_name":"Geometric modeling","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2635999917984009},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.18002","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18002","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.18002","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18002","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5915068984031677,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3,54],"(MLLMs)":[4],"have":[5],"made":[6],"impressive":[7],"progress":[8],"in":[9,42,107,135],"connecting":[10],"vision":[11],"and":[12,21,87,105,115,138,142,147],"language,":[13],"but":[14],"they":[15],"still":[16],"struggle":[17],"with":[18,32,55],"spatial":[19,67,99,155],"understanding":[20,58],"viewpoint-aware":[22],"reasoning.":[23],"Recent":[24],"efforts":[25],"aim":[26],"to":[27,40,78,91],"augment":[28],"the":[29,84],"input":[30],"representations":[31],"geometric":[33,113],"cues":[34],"rather":[35],"than":[36],"explicitly":[37],"teaching":[38],"models":[39],"reason":[41],"3D":[43,57,109,128,149,160],"space.":[44],"We":[45],"introduce":[46],"Loc3R-VLM,":[47],"a":[48,80,108,126],"framework":[49,157],"that":[50,101,153],"equips":[51],"2D":[52],"Vision-Language":[53],"advanced":[56],"capabilities":[59],"from":[60,125],"monocular":[61],"video":[62],"input.":[63],"Inspired":[64],"by":[65],"human":[66],"cognition,":[68],"Loc3R-VLM":[69,131],"relies":[70],"on":[71,145],"two":[72],"joint":[73],"objectives:":[74],"global":[75],"layout":[76],"reconstruction":[77],"build":[79],"holistic":[81],"representation":[82],"of":[83],"scene":[85],"structure,":[86],"explicit":[88],"situation":[89],"modeling":[90],"anchor":[92],"egocentric":[93],"perspective.":[94],"These":[95],"objectives":[96],"provide":[97],"direct":[98],"supervision":[100,156],"grounds":[102],"both":[103],"perception":[104],"language":[106],"context.":[110],"To":[111],"ensure":[112],"consistency":[114],"metric-scale":[116],"alignment,":[117],"we":[118],"leverage":[119],"lightweight":[120],"camera":[121],"pose":[122],"priors":[123],"extracted":[124],"pre-trained":[127],"foundation":[129],"model.":[130],"achieves":[132],"state-of-the-art":[133],"performance":[134],"language-based":[136],"localization":[137],"outperforms":[139],"existing":[140],"2D-":[141],"video-based":[143],"approaches":[144],"situated":[146],"general":[148],"question-answering":[150],"benchmarks,":[151],"demonstrating":[152],"our":[154],"enables":[158],"strong":[159],"understanding.":[161],"Project":[162],"page:":[163],"https://kevinqu7.github.io/loc3r-vlm":[164]},"counts_by_year":[],"updated_date":"2026-03-20T20:54:20.808490","created_date":"2026-03-20T00:00:00"}
