{"id":"https://openalex.org/W7153187043","doi":"https://doi.org/10.48550/arxiv.2604.07705","title":"Vision-Language Navigation for Aerial Robots: Towards the Era of Large Language Models","display_name":"Vision-Language Navigation for Aerial Robots: Towards the Era of Large Language Models","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153187043","doi":"https://doi.org/10.48550/arxiv.2604.07705"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07705","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07705","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07705","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006587319","display_name":"Xingyu Xia","orcid":"https://orcid.org/0009-0009-1294-5451"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xia, Xingyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109748895","display_name":"Lekai Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Lekai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133347207","display_name":"Yujie Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Yujie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133381789","display_name":"Xiaozhou Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xiaozhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133348448","display_name":"Hai Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133326955","display_name":"Wen Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Wen","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5006587319"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.97079998254776,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.97079998254776,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0034000000450760126,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5080000162124634},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4819999933242798},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4799000024795532},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.40720000863075256},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.39809998869895935},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3959999978542328},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.39169999957084656},{"id":"https://openalex.org/keywords/aerial-imagery","display_name":"Aerial imagery","score":0.33079999685287476}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7210000157356262},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5080000162124634},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4819999933242798},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4799000024795532},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.40720000863075256},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40130001306533813},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.39809998869895935},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3959999978542328},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3896999955177307},{"id":"https://openalex.org/C2987819851","wikidata":"https://www.wikidata.org/wiki/Q191839","display_name":"Aerial imagery","level":2,"score":0.33079999685287476},{"id":"https://openalex.org/C2777897806","wikidata":"https://www.wikidata.org/wiki/Q568742","display_name":"3D modeling","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3188000023365021},{"id":"https://openalex.org/C2778464652","wikidata":"https://www.wikidata.org/wiki/Q309849","display_name":"Open research","level":2,"score":0.31139999628067017},{"id":"https://openalex.org/C176262533","wikidata":"https://www.wikidata.org/wiki/Q4688034","display_name":"Aerial survey","level":2,"score":0.30979999899864197},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.29919999837875366},{"id":"https://openalex.org/C133214962","wikidata":"https://www.wikidata.org/wiki/Q191839","display_name":"Aerial photography","level":2,"score":0.2930999994277954},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.26350000500679016},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2556999921798706}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07705","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07705","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07705","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07705","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.5942608714103699}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Aerial":[0,39,63,83,128],"vision-and-language":[1],"navigation":[2,106],"(Aerial":[3],"VLN)":[4],"aims":[5],"to":[6,12,45],"enable":[7],"unmanned":[8],"aerial":[9],"vehicles":[10],"(UAVs)":[11],"interpret":[13],"natural":[14],"language":[15,25,51],"instructions":[16],"and":[17,34,54,66,72,94,104,118,134,136,146,156,170,197],"autonomously":[18],"navigate":[19],"complex":[20],"three-dimensional":[21],"environments":[22],"by":[23],"grounding":[24],"in":[26,140,206],"visual":[27],"perception.":[28],"This":[29],"survey":[30],"provides":[31],"a":[32,87],"critical":[33],"analytical":[35],"review":[36],"of":[37,49,82,89],"the":[38,46,62,80,124,171,207,211],"VLN":[40,64,84],"field,":[41],"with":[42,201],"particular":[43],"attention":[44],"recent":[47],"integration":[48],"large":[50],"models":[52,56],"(LLMs)":[53],"vision-language":[55],"(VLMs).":[57],"We":[58,77,121,149],"first":[59],"formally":[60],"introduce":[61],"problem":[65],"define":[67],"two":[68],"interaction":[69],"paradigms:":[70],"single-instruction":[71],"dialog-based,":[73],"as":[74],"foundational":[75],"axes.":[76],"then":[78],"organize":[79],"body":[81],"methods":[85],"into":[86],"taxonomy":[88],"five":[90],"architectural":[91,159],"categories:":[92],"sequence-to-sequence":[93],"attention-based":[95],"methods,":[96,99,101,103],"end-to-end":[97,166],"LLM/VLM":[98],"hierarchical":[100,168],"multi-agent":[102],"dialog-based":[105],"methods.":[107],"For":[108],"each":[109],"category,":[110],"we":[111,175],"systematically":[112],"analyze":[113,157],"design":[114],"rationales,":[115],"technical":[116],"trade-offs,":[117,160],"reported":[119],"performance.":[120],"critically":[122],"assess":[123],"evaluation":[125],"infrastructure":[126],"for":[127],"VLN,":[129],"including":[130,161],"datasets,":[131],"simulation":[132],"platforms,":[133],"metrics,":[135],"identify":[137],"their":[138],"gaps":[139],"scale,":[141],"environmental":[142],"diversity,":[143],"real-world":[144],"grounding,":[145,183],"metric":[147],"coverage.":[148],"consolidate":[150],"cross-method":[151],"comparisons":[152],"on":[153],"shared":[154],"benchmarks":[155],"key":[158],"discrete":[162],"versus":[163,167],"continuous":[164,189],"actions,":[165],"designs,":[169],"simulation-to-reality":[172],"gap.":[173],"Finally,":[174],"synthesize":[176],"seven":[177],"concrete":[178],"open":[179],"problems:":[180],"long-horizon":[181],"instruction":[182],"viewpoint":[184],"robustness,":[185],"scalable":[186],"spatial":[187],"representation,":[188],"6-DoF":[190],"action":[191],"execution,":[192],"onboard":[193],"deployment,":[194],"benchmark":[195],"standardization,":[196],"multi-UAV":[198],"swarm":[199],"navigation,":[200],"specific":[202],"research":[203],"directions":[204],"grounded":[205],"evidence":[208],"presented":[209],"throughout":[210],"survey.":[212]},"counts_by_year":[],"updated_date":"2026-04-11T06:19:08.300824","created_date":"2026-04-11T00:00:00"}
