{"id":"https://openalex.org/W7133489736","doi":"https://doi.org/10.48550/arxiv.2603.02972","title":"TagaVLM: Topology-Aware Global Action Reasoning for Vision-Language Navigation","display_name":"TagaVLM: Topology-Aware Global Action Reasoning for Vision-Language Navigation","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133489736","doi":"https://doi.org/10.48550/arxiv.2603.02972"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.02972","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02972","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.02972","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128085172","display_name":"Jiaxing Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiaxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128080946","display_name":"Zexi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zexi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128119880","display_name":"Xiaoyan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiaoyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091545406","display_name":"Boyue Wang","orcid":"https://orcid.org/0000-0002-2677-8342"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Boyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128083118","display_name":"Yongli Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yongli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128109717","display_name":"Baocai Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Baocai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0017000000225380063,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.0012000000569969416,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.6337000131607056},{"id":"https://openalex.org/keywords/forcing","display_name":"Forcing (mathematics)","score":0.6208999752998352},{"id":"https://openalex.org/keywords/topology","display_name":"Topology (electrical circuits)","score":0.5659000277519226},{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.5220999717712402},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.5113999843597412},{"id":"https://openalex.org/keywords/path-integration","display_name":"Path integration","score":0.508899986743927},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.46369999647140503},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.4309000074863434},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.42590001225471497}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7099000215530396},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.6337000131607056},{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.6208999752998352},{"id":"https://openalex.org/C184720557","wikidata":"https://www.wikidata.org/wiki/Q7825049","display_name":"Topology (electrical circuits)","level":2,"score":0.5659000277519226},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.5220999717712402},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.5113999843597412},{"id":"https://openalex.org/C96522737","wikidata":"https://www.wikidata.org/wiki/Q17148345","display_name":"Path integration","level":2,"score":0.508899986743927},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.46369999647140503},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4577000141143799},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.4309000074863434},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.38100001215934753},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.36570000648498535},{"id":"https://openalex.org/C2778478883","wikidata":"https://www.wikidata.org/wiki/Q7825025","display_name":"Topological complexity","level":2,"score":0.3643999993801117},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.34630000591278076},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3343000113964081},{"id":"https://openalex.org/C199845137","wikidata":"https://www.wikidata.org/wiki/Q145490","display_name":"Network topology","level":2,"score":0.3221000134944916},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.25540000200271606},{"id":"https://openalex.org/C135798126","wikidata":"https://www.wikidata.org/wiki/Q2167279","display_name":"Top-down and bottom-up design","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.25200000405311584},{"id":"https://openalex.org/C2781179811","wikidata":"https://www.wikidata.org/wiki/Q28134081","display_name":"Spatial configuration","level":3,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.02972","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02972","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.02972","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02972","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0,9],"Navigation":[1,124],"(VLN)":[2],"presents":[3],"a":[4,161],"unique":[5],"challenge":[6],"for":[7,145,188],"Large":[8],"Models":[10],"(VLMs)":[11],"due":[12],"to":[13,44,55],"their":[14,63],"inherent":[15],"architectural":[16],"mismatch:":[17],"VLMs":[18,197],"are":[19],"primarily":[20],"pretrained":[21,115],"on":[22,194,211],"static,":[23],"disembodied":[24],"vision-language":[25],"tasks,":[26],"which":[27],"fundamentally":[28],"clash":[29],"with":[30,131,160],"the":[31,87,105,132,136,150],"dynamic,":[32],"embodied,":[33],"and":[34,48,167,181],"spatially-structured":[35],"nature":[36],"of":[37,140,165,169],"navigation.":[38],"Existing":[39],"large-model-based":[40,158],"methods":[41],"often":[42],"resort":[43],"converting":[45],"rich":[46],"visual":[47],"spatial":[49,111,190],"information":[50],"into":[51,86,104],"text,":[52],"forcing":[53],"models":[54],"implicitly":[56],"infer":[57],"complex":[58],"visual-topological":[59],"relationships":[60],"or":[61],"limiting":[62],"global":[64,141],"action":[65,142],"capabilities.":[66],"To":[67,90,117],"bridge":[68],"this":[69],"gap,":[70],"we":[71],"propose":[72],"TagaVLM":[73,153],"(Topology-Aware":[74],"Global":[75],"Action":[76],"reasoning),":[77],"an":[78,122],"end-to-end":[79],"framework":[80],"that":[81],"explicitly":[82],"injects":[83],"topological":[84,92,119,134],"structures":[85],"VLM":[88],"backbone.":[89],"introduce":[91],"edge":[93],"information,":[94,121],"Spatial":[95],"Topology":[96],"Aware":[97],"Residual":[98],"Attention":[99],"(STAR-Att)":[100],"directly":[101],"integrates":[102],"it":[103],"VLM's":[106],"self-attention":[107],"mechanism,":[108],"enabling":[109],"intrinsic":[110],"reasoning":[112],"while":[113],"preserving":[114],"knowledge.":[116],"enhance":[118],"node":[120],"Interleaved":[123],"Prompt":[125],"strengthens":[126],"node-level":[127],"visual-text":[128],"alignment.":[129],"Finally,":[130],"embedded":[133],"graph,":[135],"model":[137,204],"is":[138],"capable":[139],"reasoning,":[143,191],"allowing":[144],"robust":[146],"path":[147],"correction.":[148],"On":[149],"R2R":[151],"benchmark,":[152],"achieves":[154],"state-of-the-art":[155],"performance":[156],"among":[157],"methods,":[159],"Success":[162],"Rate":[163],"(SR)":[164],"51.09%":[166],"SPL":[168],"47.18":[170],"in":[171,179,183],"unseen":[172],"environments,":[173],"outperforming":[174],"prior":[175],"work":[176],"by":[177],"3.39%":[178],"SR":[180],"9.08":[182],"SPL.":[184],"This":[185],"demonstrates":[186],"that,":[187],"embodied":[189],"targeted":[192],"enhancements":[193],"smaller":[195],"open-source":[196],"can":[198,208],"be":[199,209],"more":[200],"effective":[201],"than":[202],"brute-force":[203],"scaling.":[205],"The":[206],"code":[207],"found":[210],"our":[212],"project":[213],"page:":[214],"https://apex-bjut.github.io/Taga-VLM":[215]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-05T00:00:00"}
