{"id":"https://openalex.org/W7160963749","doi":"https://doi.org/10.48550/arxiv.2605.09418","title":"MAG-VLAQ: Multi-modal Aerial-Ground Query Aggregation for Cross-View Place Recognition","display_name":"MAG-VLAQ: Multi-modal Aerial-Ground Query Aggregation for Cross-View Place Recognition","publication_year":2026,"publication_date":"2026-05-10","ids":{"openalex":"https://openalex.org/W7160963749","doi":"https://doi.org/10.48550/arxiv.2605.09418"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.09418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.09418","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135988260","display_name":"Zhengyi Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zhengyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135999570","display_name":"Yuhang Ming","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135965198","display_name":"Zhihao Zhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhan, Zhihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123516232","display_name":"Hanyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hanyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135966408","display_name":"Javier Civera","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Civera, Javier","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135961478","display_name":"Wanzeng Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Wanzeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.5910999774932861,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.5910999774932861,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.18549999594688416,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.08950000256299973,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5898000001907349},{"id":"https://openalex.org/keywords/robotics","display_name":"Robotics","score":0.424699991941452},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4101000130176544},{"id":"https://openalex.org/keywords/query-expansion","display_name":"Query expansion","score":0.40119999647140503},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.37279999256134033},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.35670000314712524},{"id":"https://openalex.org/keywords/differential","display_name":"Differential (mechanical device)","score":0.350600004196167},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.31119999289512634}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7317000031471252},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6087999939918518},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5898000001907349},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.424699991941452},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4101000130176544},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4047999978065491},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.40119999647140503},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.37279999256134033},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.35670000314712524},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.31119999289512634},{"id":"https://openalex.org/C2992525071","wikidata":"https://www.wikidata.org/wiki/Q50818671","display_name":"Federated learning","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C13336665","wikidata":"https://www.wikidata.org/wiki/Q125977","display_name":"Vector space","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.2590000033378601},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25529998540878296},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2551000118255615},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.25279998779296875},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2524000108242035},{"id":"https://openalex.org/C51399673","wikidata":"https://www.wikidata.org/wiki/Q504027","display_name":"Lidar","level":2,"score":0.2517000138759613},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.09418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.09418","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.09418","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-modal":[0],"cross-view":[1,43],"place":[2,44],"recognition":[3],"remains":[4],"a":[5,35,81],"fundamental":[6],"challenge":[7],"in":[8,183],"computer":[9],"vision":[10],"and":[11,19,25,61,88,150,161],"robotics":[12],"due":[13],"to":[14,53,126,138,147],"the":[15,118,127,134,164,177,184,191],"severe":[16],"viewpoint,":[17],"modality,":[18],"spatial-structure":[20],"discrepancies":[21],"between":[22],"ground":[23,60,71],"observations":[24],"aerial":[26,62],"references.":[27],"To":[28],"address":[29],"this":[30,116],"challenge,":[31],"we":[32,94],"present":[33],"MAG-VLAQ,":[34],"foundation-model-enhanced":[36],"query":[37,120],"aggregation":[38],"framework":[39],"for":[40,85],"multi-modal":[41,129],"aerial-ground":[42,155],"recognition.":[45],"Specifically,":[46],"our":[47,91,167,173],"approach":[48],"leverages":[49],"pre-trained":[50],"foundation":[51],"models":[52],"extract":[54],"dense":[55],"visual":[56,149],"tokens":[57,69,76],"from":[58,70,190],"both":[59],"images,":[63],"as":[64,66],"well":[65],"expressive":[67],"geometric":[68,151],"LiDAR":[72],"observations.":[73],"These":[74],"heterogeneous":[75],"are":[77,122],"then":[78],"projected":[79],"into":[80],"shared":[82],"embedding":[83],"space":[84],"cross-modal":[86],"alignment":[87],"fusion.":[89],"As":[90],"main":[92],"contribution,":[93],"propose":[95],"ODE-conditioned":[96],"VLAQ,":[97],"which":[98],"tightly":[99],"couples":[100],"neural":[101],"ordinary":[102],"differential":[103],"equations":[104],"(ODE)-based":[105],"RGB-LiDAR":[106],"fusion":[107],"with":[108,188],"vectors":[109],"of":[110,166],"locally":[111],"aggregated":[112],"queries":[113],"(VLAQ).":[114],"In":[115],"design,":[117],"VLAQ":[119],"centers":[121],"dynamically":[123],"adapted":[124],"according":[125],"fused":[128],"state.":[130],"This":[131],"mechanism":[132],"allows":[133],"final":[135],"global":[136],"descriptor":[137],"preserve":[139],"globally":[140],"learned":[141],"retrieval":[142],"prototypes":[143],"while":[144],"remaining":[145],"responsive":[146],"scene-specific":[148],"evidence,":[152],"significantly":[153],"improving":[154],"matching.":[156],"Extensive":[157],"experiments":[158],"on":[159,171],"KITTI360-AG":[160],"nuScenes-AG":[162],"validate":[163],"effectiveness":[165],"proposed":[168],"MAG-VLAQ.":[169],"Notably,":[170],"KITTI360-AG,":[172],"MAG-VLAQ":[174],"nearly":[175],"doubles":[176],"state-of-the-art":[178],"performance,":[179],"achieving":[180],"61.1":[181],"Recall@1":[182],"satellite":[185],"setting,":[186],"compared":[187],"34.5":[189],"closest":[192],"competing":[193],"approach.":[194]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-13T00:00:00"}
