{"id":"https://openalex.org/W7154990801","doi":"https://doi.org/10.48550/arxiv.2604.16248","title":"Where Do Vision-Language Models Fail? World Scale Analysis for Image Geolocalization","display_name":"Where Do Vision-Language Models Fail? World Scale Analysis for Image Geolocalization","publication_year":2026,"publication_date":"2026-04-17","ids":{"openalex":"https://openalex.org/W7154990801","doi":"https://doi.org/10.48550/arxiv.2604.16248"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.16248","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16248","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.16248","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113413172","display_name":"Siddhant Bharadwaj","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bharadwaj, Siddhant","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128143652","display_name":"Ashish Vashist","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vashist, Ashish","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055798410","display_name":"Fahimul Aleem","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aleem, Fahimul","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5072711888","display_name":"Shruti Vyas","orcid":"https://orcid.org/0000-0002-5591-5086"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vyas, Shruti","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5113413172"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5378000140190125,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5378000140190125,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.24410000443458557,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10757","display_name":"Geographic Information Systems Studies","score":0.03220000118017197,"subfield":{"id":"https://openalex.org/subfields/3305","display_name":"Geography, Planning and Development"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5575000047683716},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5551999807357788},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.42080000042915344},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4205999970436096},{"id":"https://openalex.org/keywords/intersection","display_name":"Intersection (aeronautics)","score":0.41830000281333923},{"id":"https://openalex.org/keywords/global-positioning-system","display_name":"Global Positioning System","score":0.4000999927520752},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.36959999799728394}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6111000180244446},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5845000147819519},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5575000047683716},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5551999807357788},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.42080000042915344},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4207000136375427},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4205999970436096},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.41830000281333923},{"id":"https://openalex.org/C60229501","wikidata":"https://www.wikidata.org/wiki/Q18822","display_name":"Global Positioning System","level":2,"score":0.4000999927520752},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.36959999799728394},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3398999869823456},{"id":"https://openalex.org/C41856607","wikidata":"https://www.wikidata.org/wiki/Q483130","display_name":"Geographic information system","level":2,"score":0.3361999988555908},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.32109999656677246},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.30329999327659607},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29120001196861267},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2718999981880188},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.2623000144958496},{"id":"https://openalex.org/C9770341","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Geospatial analysis","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25780001282691956}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.16248","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16248","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.16248","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16248","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Image":[0],"geolocalization":[1,53,110,134],"has":[2],"traditionally":[3],"been":[4],"addressed":[5],"through":[6],"retrieval-based":[7],"place":[8],"recognition":[9],"or":[10,66],"geometry-based":[11],"visual":[12],"localization":[13],"pipelines.":[14],"Recent":[15],"advances":[16],"in":[17,33,74,117],"Vision-Language":[18],"Models":[19],"(VLMs)":[20],"have":[21],"demonstrated":[22],"strong":[23],"zero-shot":[24,76],"reasoning":[25,107,147],"capabilities":[26],"across":[27,100],"multimodal":[28,146],"tasks,":[29],"yet":[30],"their":[31,90],"performance":[32],"geographic":[34,120,149],"inference":[35],"remains":[36],"underexplored.":[37],"In":[38],"this":[39],"work,":[40],"we":[41,69],"present":[42],"a":[43,75,137],"systematic":[44],"evaluation":[45],"of":[46,59,105,114,129,145],"multiple":[47],"state-of-the-art":[48],"VLMs":[49,116,131],"for":[50,108,132,139],"country-level":[51,133],"image":[52,62],"using":[54],"ground-view":[55],"imagery":[56],"only.":[57],"Instead":[58],"relying":[60],"on":[61,83],"matching,":[63],"GPS":[64],"metadata,":[65],"task-specific":[67],"training,":[68],"evaluate":[70],"prompt-based":[71],"country":[72],"prediction":[73],"setting.":[77],"The":[78],"selected":[79],"models":[80],"are":[81],"tested":[82],"three":[84],"geographically":[85],"diverse":[86],"datasets":[87],"to":[88],"assess":[89],"robustness":[91],"and":[92,111,135,148],"generalization":[93],"ability.":[94],"Our":[95],"results":[96],"reveal":[97],"substantial":[98],"variation":[99],"models,":[101],"highlighting":[102],"the":[103,112,125,143],"potential":[104],"semantic":[106],"coarse":[109],"limitations":[113],"current":[115],"capturing":[118],"fine-grained":[119],"cues.":[121],"This":[122],"study":[123],"provides":[124],"first":[126],"focused":[127],"comparison":[128],"modern":[130],"establishes":[136],"foundation":[138],"future":[140],"research":[141],"at":[142],"intersection":[144],"understanding.":[150]},"counts_by_year":[],"updated_date":"2026-04-21T06:12:34.886580","created_date":"2026-04-21T00:00:00"}
