{"id":"https://openalex.org/W7135079647","doi":"https://doi.org/10.48550/arxiv.2603.10703","title":"WalkGPT: Grounded Vision-Language Conversation with Depth-Aware Segmentation for Pedestrian Navigation","display_name":"WalkGPT: Grounded Vision-Language Conversation with Depth-Aware Segmentation for Pedestrian Navigation","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135079647","doi":"https://doi.org/10.48550/arxiv.2603.10703"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10703","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10703","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068116325","display_name":"Rafi Ibn Sultan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sultan, Rafi Ibn","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128884402","display_name":"Hui Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128871881","display_name":"Xiangyu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Xiangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014432190","display_name":"Chengyin Li","orcid":"https://orcid.org/0000-0003-2450-9760"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chengyin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015133769","display_name":"Prashant Khanduri","orcid":"https://orcid.org/0000-0003-3055-2917"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khanduri, Prashant","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082375623","display_name":"Marco Brocanelli","orcid":"https://orcid.org/0000-0002-3603-1402"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brocanelli, Marco","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128901573","display_name":"Dongxiao Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Dongxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9377999901771545,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9377999901771545,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.03460000082850456,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.004100000020116568,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6233999729156494},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.5694000124931335},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5437999963760376},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4982999861240387},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4844000041484833},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4627000093460083},{"id":"https://openalex.org/keywords/pedestrian","display_name":"Pedestrian","score":0.44440001249313354},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4228000044822693},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.3855000138282776}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7681999802589417},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6614999771118164},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6233999729156494},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.578000009059906},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.5694000124931335},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5437999963760376},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4982999861240387},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4844000041484833},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4627000093460083},{"id":"https://openalex.org/C2777113093","wikidata":"https://www.wikidata.org/wiki/Q221488","display_name":"Pedestrian","level":2,"score":0.44440001249313354},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4228000044822693},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3855000138282776},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.38280001282691956},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3619999885559082},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.34139999747276306},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.34060001373291016},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.3379000127315521},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3375999927520752},{"id":"https://openalex.org/C2776865275","wikidata":"https://www.wikidata.org/wiki/Q311666","display_name":"Projector","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3003999888896942},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2809000015258789},{"id":"https://openalex.org/C60229501","wikidata":"https://www.wikidata.org/wiki/Q18822","display_name":"Global Positioning System","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C16345878","wikidata":"https://www.wikidata.org/wiki/Q107472979","display_name":"Orientation (vector space)","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C522192633","wikidata":"https://www.wikidata.org/wiki/Q34228","display_name":"Sign language","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.2556999921798706},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10703","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10703","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","score":0.6610536575317383,"display_name":"Sustainable cities and communities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Ensuring":[0],"accessible":[1,98],"pedestrian":[2],"navigation":[3,86,172],"requires":[4],"reasoning":[5,69,199],"about":[6],"both":[7],"semantic":[8],"and":[9,43,70,84,99,130,155,170,189,200,206],"spatial":[10,128],"aspects":[11],"of":[12,36,63,181],"complex":[13],"urban":[14],"scenes,":[15],"a":[16,56,73,81,85,90,110,131,138,178],"challenge":[17],"that":[18,96,115,143,194],"existing":[19],"Large":[20],"Vision-Language":[21],"Models":[22],"(LVLMs)":[23],"struggle":[24],"to":[25,40,167],"meet.":[26],"Although":[27],"these":[28],"models":[29],"can":[30],"describe":[31],"visual":[32],"content,":[33],"their":[34,48],"lack":[35],"explicit":[37],"grounding":[38,154],"leads":[39],"object":[41],"hallucinations":[42],"unreliable":[44],"depth":[45,105,156],"reasoning,":[46],"limiting":[47],"usefulness":[49],"for":[50,59,76],"accessibility":[51,78],"guidance.":[52,79,173],"We":[53,174],"introduce":[54,176],"WalkGPT,":[55],"pixel-grounded":[57],"LVLM":[58],"the":[60,117,165,211],"new":[61],"task":[62],"Grounded":[64],"Navigation":[65],"Guide,":[66],"unifying":[67],"language":[68,145],"segmentation":[71,94,201],"within":[72],"single":[74],"architecture":[75],"depth-aware":[77],"Given":[80],"pedestrian-view":[82,183],"image":[83,119],"query,":[87],"WalkGPT":[88,195],"generates":[89],"conversational":[91],"response":[92],"with":[93,103,186],"masks":[95],"delineate":[97],"harmful":[100],"features,":[101],"along":[102,124],"relative":[104],"estimation.":[106],"The":[107,203],"model":[108,166],"incorporates":[109],"Multi-Scale":[111],"Query":[112],"Projector":[113,134],"(MSQP)":[114],"shapes":[116],"final":[118],"tokens":[120,126],"by":[121,137],"aggregating":[122],"them":[123],"text":[125],"across":[127],"hierarchies,":[129],"Calibrated":[132],"Text":[133],"(CTP),":[135],"guided":[136],"proposed":[139],"Region":[140],"Alignment":[141],"Loss,":[142],"maps":[144],"embeddings":[146],"into":[147],"segmentation-aware":[148],"representations.":[149],"These":[150],"components":[151],"enable":[152],"fine-grained":[153],"inference":[157],"without":[158],"user-provided":[159],"cues":[160],"or":[161],"anchor":[162],"points,":[163],"allowing":[164],"generate":[168],"complete":[169],"realistic":[171],"also":[175],"PAVE,":[177],"large-scale":[179],"benchmark":[180],"41k":[182],"images":[184],"paired":[185],"accessibility-aware":[187],"questions":[188],"depth-grounded":[190],"answers.":[191],"Experiments":[192],"show":[193],"achieves":[196],"strong":[197],"grounded":[198],"performance.":[202],"source":[204],"code":[205],"dataset":[207],"are":[208],"available":[209],"on":[210],"\\href{https://sites.google.com/view/walkgpt-26/home}{project":[212],"website}.":[213]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-13T00:00:00"}
