{"id":"https://openalex.org/W7128615163","doi":"https://doi.org/10.48550/arxiv.2602.09657","title":"AutoFly: Vision-Language-Action Model for UAV Autonomous Navigation in the Wild","display_name":"AutoFly: Vision-Language-Action Model for UAV Autonomous Navigation in the Wild","publication_year":2026,"publication_date":"2026-02-10","ids":{"openalex":"https://openalex.org/W7128615163","doi":"https://doi.org/10.48550/arxiv.2602.09657"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.09657","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.09657","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.09657","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042984551","display_name":"Xiaolou Sun","orcid":"https://orcid.org/0009-0007-3597-3872"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Xiaolou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125669825","display_name":"Wufei Si","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Si, Wufei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125673831","display_name":"Wenhui Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ni, Wenhui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125644143","display_name":"Yuntian Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuntian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125680284","display_name":"Dongming Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Dongming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125662039","display_name":"Fei Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Fei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079695819","display_name":"Runwei Guan","orcid":"https://orcid.org/0000-0003-4013-2107"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Runwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125625317","display_name":"He-Yang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, He-Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125645318","display_name":"Henghui Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Henghui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125650170","display_name":"Yuan Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125625122","display_name":"Yutao Yue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yue, Yutao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125669087","display_name":"Yongming Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yongming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125645551","display_name":"Hui Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Hui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5042984551"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9550999999046326,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9550999999046326,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.008200000040233135,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/obstacle-avoidance","display_name":"Obstacle avoidance","score":0.682699978351593},{"id":"https://openalex.org/keywords/obstacle","display_name":"Obstacle","score":0.6710000038146973},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.641700029373169},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5849000215530396},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.5604000091552734},{"id":"https://openalex.org/keywords/autonomous-system","display_name":"Autonomous system (mathematics)","score":0.5436999797821045},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4749000072479248},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.45579999685287476},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.4507000148296356}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7225000262260437},{"id":"https://openalex.org/C6683253","wikidata":"https://www.wikidata.org/wiki/Q7075535","display_name":"Obstacle avoidance","level":4,"score":0.682699978351593},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.6710000038146973},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.641700029373169},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5849000215530396},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5767999887466431},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.5604000091552734},{"id":"https://openalex.org/C9628104","wikidata":"https://www.wikidata.org/wiki/Q788009","display_name":"Autonomous system (mathematics)","level":2,"score":0.5436999797821045},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5151000022888184},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4749000072479248},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.45579999685287476},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.4507000148296356},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.42239999771118164},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.39899998903274536},{"id":"https://openalex.org/C2777891301","wikidata":"https://www.wikidata.org/wiki/Q3475123","display_name":"Navigation system","level":2,"score":0.39149999618530273},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.3894999921321869},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.38109999895095825},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.33489999175071716},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.32280001044273376},{"id":"https://openalex.org/C2778835581","wikidata":"https://www.wikidata.org/wiki/Q2916098","display_name":"Autonomous robot","level":4,"score":0.3224000036716461},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3140999972820282},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.31200000643730164},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.29980000853538513},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C133731056","wikidata":"https://www.wikidata.org/wiki/Q4917288","display_name":"Control engineering","level":1,"score":0.27869999408721924},{"id":"https://openalex.org/C60229501","wikidata":"https://www.wikidata.org/wiki/Q18822","display_name":"Global Positioning System","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C44154836","wikidata":"https://www.wikidata.org/wiki/Q45045","display_name":"Simulation","level":1,"score":0.27489998936653137},{"id":"https://openalex.org/C2777552389","wikidata":"https://www.wikidata.org/wiki/Q1962728","display_name":"Course (navigation)","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C16345878","wikidata":"https://www.wikidata.org/wiki/Q107472979","display_name":"Orientation (vector space)","level":2,"score":0.2671999931335449},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.25119999051094055},{"id":"https://openalex.org/C26990112","wikidata":"https://www.wikidata.org/wiki/Q6887224","display_name":"Mobile robot navigation","level":5,"score":0.25110000371932983}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.09657","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.09657","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.09657","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.09657","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-language":[0],"navigation":[1,55,165],"(VLN)":[2],"requires":[3],"intelligent":[4],"agents":[5],"to":[6,37,71,108,173,207],"navigate":[7,73],"environments":[8,52],"by":[9],"interpreting":[10],"linguistic":[11,125],"instructions":[12,36,56],"alongside":[13],"visual":[14],"observations,":[15],"serving":[16],"as":[17],"a":[18,98,114,162,201],"cornerstone":[19],"task":[20],"in":[21,50],"Embodied":[22],"AI.":[23],"Current":[24],"VLN":[25,132],"research":[26],"for":[27,92,137],"unmanned":[28],"aerial":[29],"vehicles":[30],"(UAVs)":[31],"relies":[32],"on":[33,146],"detailed,":[34],"pre-specified":[35],"guide":[38],"the":[39,169],"UAV":[40,94],"along":[41],"predetermined":[42],"routes.":[43],"However,":[44],"real-world":[45,138,154,192],"outdoor":[46],"exploration":[47],"typically":[48],"occurs":[49],"unknown":[51],"where":[53],"detailed":[54],"are":[57],"unavailable.":[58],"Instead,":[59],"only":[60],"coarse-grained":[61],"positional":[62],"or":[63],"directional":[64],"guidance":[65],"can":[66],"be":[67],"provided,":[68],"requiring":[69],"UAVs":[70],"autonomously":[72],"through":[74],"continuous":[75,182],"planning":[76],"and":[77,124,152,187,216],"obstacle":[78,183],"avoidance.":[79],"To":[80,156],"bridge":[81],"this":[82],"gap,":[83],"we":[84,160],"propose":[85],"AutoFly,":[86],"an":[87],"end-to-end":[88],"Vision-Language-Action":[89],"(VLA)":[90],"model":[91],"autonomous":[93,139,150,164,174,185],"navigation.":[95],"AutoFly":[96,199],"incorporates":[97],"pseudo-depth":[99],"encoder":[100],"that":[101,119,167,198],"derives":[102],"depth-aware":[103],"features":[104],"from":[105,142,171],"RGB":[106],"inputs":[107],"enhance":[109],"spatial":[110],"reasoning,":[111],"coupled":[112],"with":[113,127,211],"progressive":[115],"two-stage":[116],"training":[117],"strategy":[118],"effectively":[120],"aligns":[121],"visual,":[122],"depth,":[123],"representations":[126],"action":[128],"policies.":[129],"Moreover,":[130],"existing":[131],"datasets":[133],"have":[134],"fundamental":[135],"limitations":[136],"navigation,":[140],"stemming":[141],"their":[143],"heavy":[144],"reliance":[145],"explicit":[147],"instruction-following":[148,172],"over":[149],"decision-making":[151],"insufficient":[153],"data.":[155],"address":[157],"these":[158],"issues,":[159],"construct":[161],"novel":[163],"dataset":[166],"shifts":[168],"paradigm":[170],"behavior":[175],"modeling":[176],"through:":[177],"(1)":[178],"trajectory":[179],"collection":[180],"emphasizing":[181],"avoidance,":[184],"planning,":[186],"recognition":[188],"workflows;":[189],"(2)":[190],"comprehensive":[191],"data":[193],"integration.":[194],"Experimental":[195],"results":[196],"demonstrate":[197],"achieves":[200],"3.9%":[202],"higher":[203],"success":[204],"rate":[205],"compared":[206],"state-of-the-art":[208],"VLA":[209],"baselines,":[210],"consistent":[212],"performance":[213],"across":[214],"simulated":[215],"real":[217],"environments.":[218]},"counts_by_year":[],"updated_date":"2026-02-12T06:17:30.163165","created_date":"2026-02-12T00:00:00"}
