{"id":"https://openalex.org/W7134903267","doi":"https://doi.org/10.48550/arxiv.2603.09512","title":"Probing the Reliability of Driving VLMs: From Inconsistent Responses to Grounded Temporal Reasoning","display_name":"Probing the Reliability of Driving VLMs: From Inconsistent Responses to Grounded Temporal Reasoning","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134903267","doi":"https://doi.org/10.48550/arxiv.2603.09512"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.09512","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09512","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.09512","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128759424","display_name":"Chun-Peng Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chang, Chun-Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128705400","display_name":"Chen-Yu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chen-Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001917215","display_name":"Holger Caesar","orcid":"https://orcid.org/0000-0001-5099-6297"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Caesar, Holger","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5046969927","display_name":"Alain Pagani","orcid":"https://orcid.org/0000-0002-5136-0837"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pagani, Alain","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5128759424"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.0032999999821186066,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5792999863624573},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5462999939918518},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5324000120162964},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.460999995470047},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.44920000433921814},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4449999928474426},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.3939000070095062},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.375900000333786}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6647999882698059},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5792999863624573},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5462999939918518},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5324000120162964},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5252000093460083},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.460999995470047},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4523000121116638},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.44920000433921814},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.3939000070095062},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.375900000333786},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.3686000108718872},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.32989999651908875},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2992999851703644},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C77277458","wikidata":"https://www.wikidata.org/wiki/Q1969246","display_name":"Temporal database","level":2,"score":0.27410000562667847},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.09512","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09512","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.09512","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.09512","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.76737380027771}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"A":[0],"reliable":[1,88],"driving":[2,29],"assistant":[3],"should":[4],"provide":[5],"consistent":[6],"responses":[7,120],"based":[8],"on":[9,97,164,174],"temporally":[10,53],"grounded":[11,54],"reasoning":[12,84,218,225],"derived":[13],"from":[14,139],"observed":[15],"information.":[16],"In":[17,205],"this":[18,105],"work,":[19],"we":[20,92,151,186,207],"investigate":[21],"whether":[22,43],"Vision-Language":[23],"Models":[24],"(VLMs),":[25],"when":[26],"applied":[27],"as":[28],"assistants,":[30],"can":[31],"response":[32,107],"consistantly":[33],"and":[34,71,85,125,135,191,223],"understand":[35],"how":[36],"present":[37],"observations":[38],"shape":[39],"future":[40,83,202],"outcomes,":[41],"or":[42,146],"their":[44],"outputs":[45],"merely":[46],"reflect":[47],"patterns":[48,176],"memorized":[49],"during":[50],"training":[51],"without":[52,226],"reasoning.":[55,204],"While":[56],"recent":[57],"efforts":[58],"have":[59],"integrated":[60],"VLMs":[61],"into":[62],"autonomous":[63],"driving,":[64],"prior":[65],"studies":[66],"typically":[67],"emphasize":[68],"scene":[69,203],"understanding":[70,158],"instruction":[72],"generation,":[73],"implicitly":[74],"assuming":[75],"that":[76,153,219],"strong":[77,156],"visual":[78,157],"interpretation":[79],"naturally":[80],"enables":[81],"consistant":[82],"thus":[86],"ensures":[87],"decision-making,":[89],"a":[90,170,194,209],"claim":[91],"critically":[93],"examine.":[94],"We":[95],"focus":[96],"two":[98],"major":[99],"challenges":[100],"limiting":[101],"VLM":[102],"reliability":[103],"in":[104,117,129,144],"setting:":[106],"inconsistency,":[108],"where":[109],"minor":[110],"input":[111],"perturbations":[112],"yield":[113],"different":[114],"answers":[115],"or,":[116],"some":[118],"cases,":[119],"degenerate":[121],"toward":[122],"near-random":[123],"guessing,":[124],"limited":[126],"temporal":[127,167,180,224,228],"reasoning,":[128,168],"which":[130],"models":[131,154],"fail":[132],"to":[133,172,200],"reason":[134],"align":[136],"sequential":[137],"events":[138],"current":[140],"observations,":[141],"often":[142],"resulting":[143],"incorrect":[145],"even":[147],"contradictory":[148],"responses.":[149],"Moreover,":[150],"find":[152],"with":[155,216],"do":[159],"not":[160],"necessarily":[161],"perform":[162],"best":[163],"tasks":[165],"requiring":[166,227],"indicating":[169],"tendency":[171],"over-rely":[173],"pretrained":[175],"rather":[177],"than":[178],"modeling":[179],"dynamics.":[181],"To":[182],"address":[183],"these":[184],"issues,":[185],"adopt":[187],"existing":[188],"evaluation":[189],"methods":[190],"introduce":[192],"FutureVQA,":[193],"human-annotated":[195],"benchmark":[196],"dataset":[197],"specifically":[198],"designed":[199],"assess":[201],"addition,":[206],"propose":[208],"simple":[210],"yet":[211],"effective":[212],"self-supervised":[213],"tuning":[214],"approach":[215],"chain-of-thought":[217],"improves":[220],"both":[221],"consistency":[222],"labels.":[229]},"counts_by_year":[],"updated_date":"2026-03-12T06:18:43.230356","created_date":"2026-03-12T00:00:00"}
