{"id":"https://openalex.org/W4405785489","doi":"https://doi.org/10.1109/iros58592.2024.10802451","title":"VANP: Learning Where to See for Navigation with Self-Supervised Vision-Action Pre-Training","display_name":"VANP: Learning Where to See for Navigation with Self-Supervised Vision-Action Pre-Training","publication_year":2024,"publication_date":"2024-10-14","ids":{"openalex":"https://openalex.org/W4405785489","doi":"https://doi.org/10.1109/iros58592.2024.10802451"},"language":"en","primary_location":{"id":"doi:10.1109/iros58592.2024.10802451","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros58592.2024.10802451","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045302522","display_name":"Mohammad Nazeri","orcid":"https://orcid.org/0000-0003-3125-6298"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Mohammad Nazeri","raw_affiliation_strings":["George Mason University,Department of Computer Science"],"affiliations":[{"raw_affiliation_string":"George Mason University,Department of Computer Science","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101861061","display_name":"Junzhe Wang","orcid":"https://orcid.org/0000-0002-3001-8055"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Junzhe Wang","raw_affiliation_strings":["George Mason University,Department of Computer Science"],"affiliations":[{"raw_affiliation_string":"George Mason University,Department of Computer Science","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028803790","display_name":"Amirreza Payandeh","orcid":"https://orcid.org/0009-0003-3145-3096"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Amirreza Payandeh","raw_affiliation_strings":["George Mason University,Department of Computer Science"],"affiliations":[{"raw_affiliation_string":"George Mason University,Department of Computer Science","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017662025","display_name":"Xuesu Xiao","orcid":"https://orcid.org/0000-0001-5151-2186"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xuesu Xiao","raw_affiliation_strings":["George Mason University,Department of Computer Science"],"affiliations":[{"raw_affiliation_string":"George Mason University,Department of Computer Science","institution_ids":["https://openalex.org/I162714631"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5045302522"],"corresponding_institution_ids":["https://openalex.org/I162714631"],"apc_list":null,"apc_paid":null,"fwci":2.609,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.92455452,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2741","last_page":"2746"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.8356999754905701,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.8356999754905701,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.8288999795913696,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.78184974193573},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6765267848968506},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6210557818412781},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5974056124687195},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5261877775192261},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.484971284866333},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.07245132327079773}],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.78184974193573},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6765267848968506},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6210557818412781},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5974056124687195},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5261877775192261},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.484971284866333},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.07245132327079773},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros58592.2024.10802451","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros58592.2024.10802451","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320338281","display_name":"Army Research Office","ror":"https://ror.org/05epdh915"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W2108598243","https://openalex.org/W2194775991","https://openalex.org/W2752796333","https://openalex.org/W2962894046","https://openalex.org/W2963341956","https://openalex.org/W2986357018","https://openalex.org/W3009593063","https://openalex.org/W4206954811","https://openalex.org/W4285117889","https://openalex.org/W4385245566","https://openalex.org/W4389666015","https://openalex.org/W4391800510","https://openalex.org/W4402401349","https://openalex.org/W6684338915","https://openalex.org/W6757817989","https://openalex.org/W6784333009","https://openalex.org/W6791742336","https://openalex.org/W6795754764","https://openalex.org/W6810080435","https://openalex.org/W6850261633","https://openalex.org/W6853468020","https://openalex.org/W6857060602","https://openalex.org/W6858547017","https://openalex.org/W6858612488","https://openalex.org/W6859169318"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Humans":[0],"excel":[1],"at":[2],"efficiently":[3],"navigating":[4],"through":[5],"crowds":[6],"without":[7],"collision":[8],"by":[9,87,169],"focusing":[10],"on":[11,25,30,124,204],"specific":[12,125],"visual":[13,21,126,143],"regions":[14,127],"relevant":[15,38,130],"to":[16,39,72,78,121,131],"navigation.":[17],"However,":[18],"most":[19,180],"robotic":[20,73],"navigation":[22,40,48,74,133,186],"methods":[23],"rely":[24],"deep":[26],"learning":[27,60],"models":[28,49,193,202],"pre-trained":[29],"vision":[31,64],"tasks,":[32],"which":[33],"prioritize":[34],"salient":[35,107],"objects\u2014not":[36],"necessarily":[37],"and":[41,65,147,153,201],"potentially":[42],"misleading.":[43],"Alternative":[44],"approaches":[45],"train":[46],"specialized":[47],"from":[50],"scratch,":[51],"requiring":[52],"significant":[53],"computation.":[54],"On":[55],"the":[56,79,132,164,167,198],"other":[57],"hand,":[58],"self-supervised":[59],"has":[61],"revolutionized":[62],"computer":[63],"natural":[66],"language":[67],"processing,":[68],"but":[69],"its":[70],"application":[71],"remains":[75],"underexplored":[76],"due":[77],"difficulty":[80],"of":[81,105,142],"defining":[82],"effective":[83],"self-supervision":[84],"signals.":[85],"Motivated":[86],"these":[88],"observations,":[89,144],"in":[90],"this":[91],"work,":[92],"we":[93],"propose":[94],"a":[95,140,148,171,205],"Self-Supervised":[96],"Vision-Action":[97],"Model":[98],"for":[99,112,151],"Visual":[100],"Navigation":[101],"Pre-Training":[102],"(VANP).":[103],"Instead":[104],"detecting":[106],"objects":[108],"that":[109,128,179],"are":[110,129],"beneficial":[111],"tasks":[113],"such":[114],"as":[115,192],"classification":[116],"or":[117],"detection,":[118],"VANP":[119,138,162,188],"learns":[120],"focus":[122],"only":[123,213],"task.":[134],"To":[135],"achieve":[136],"this,":[137],"uses":[139],"history":[141],"future":[145],"actions,":[146],"goal":[149],"image":[150],"self-supervision,":[152],"embeds":[154],"them":[155],"using":[156,170],"two":[157],"small":[158],"Transformer":[159],"Encoders.":[160],"Then,":[161],"maximizes":[163],"information":[165,173],"between":[166],"embeddings":[168],"mutual":[172],"maximization":[174],"objective":[175],"function.":[176],"We":[177],"demonstrate":[178],"VANP-extracted":[181],"features":[182],"match":[183],"with":[184,196,212],"human":[185],"intuition.":[187],"achieves":[189],"comparable":[190],"performance":[191],"learned":[194],"end-to-end":[195],"half":[197],"training":[199],"time":[200],"trained":[203],"large-scale,":[206],"fully":[207],"supervised":[208],"dataset,":[209],"i.e.,":[210],"ImageNet,":[211],"0.08%":[214],"data.":[215],"<sup":[216],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[217],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[218]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T14:29:43.386196","created_date":"2025-10-10T00:00:00"}
