{"id":"https://openalex.org/W7161121918","doi":"https://doi.org/10.48550/arxiv.2605.13737","title":"Senses Wide Shut: A Representation-Action Gap in Omnimodal LLMs","display_name":"Senses Wide Shut: A Representation-Action Gap in Omnimodal LLMs","publication_year":2026,"publication_date":"2026-05-13","ids":{"openalex":"https://openalex.org/W7161121918","doi":"https://doi.org/10.48550/arxiv.2605.13737"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.13737","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13737","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.13737","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136182241","display_name":"Trung Nguyen Quang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Quang, Trung Nguyen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102887433","display_name":"Yiming Gao","orcid":"https://orcid.org/0000-0001-7705-7142"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Yiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109714657","display_name":"Fanyi Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pu, Fanyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136182868","display_name":"Kaichen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Kaichen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136119644","display_name":"Shuo Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Shuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136129904","display_name":"Ziwei Liu","orcid":"https://orcid.org/0009-0007-1778-8577"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.3091999888420105,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.3091999888420105,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10730","display_name":"Language Development and Disorders","score":0.16940000653266907,"subfield":{"id":"https://openalex.org/subfields/3204","display_name":"Developmental and Educational Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.11710000038146973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/premise","display_name":"Premise","score":0.8432999849319458},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5228999853134155},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5167999863624573},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.5040000081062317},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4945000112056732},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4090999960899353},{"id":"https://openalex.org/keywords/common-ground","display_name":"Common ground","score":0.39070001244544983},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.38100001215934753},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.36890000104904175}],"concepts":[{"id":"https://openalex.org/C2778023277","wikidata":"https://www.wikidata.org/wiki/Q321703","display_name":"Premise","level":2,"score":0.8432999849319458},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5228999853134155},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5167999863624573},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.5040000081062317},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4945000112056732},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4560999870300293},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4090999960899353},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.39750000834465027},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3935000002384186},{"id":"https://openalex.org/C2777877512","wikidata":"https://www.wikidata.org/wiki/Q1116097","display_name":"Common ground","level":2,"score":0.39070001244544983},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.38100001215934753},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.37310001254081726},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.36890000104904175},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.357699990272522},{"id":"https://openalex.org/C140331021","wikidata":"https://www.wikidata.org/wiki/Q1868104","display_name":"Logit","level":2,"score":0.35109999775886536},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.34119999408721924},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.320499986410141},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.3059999942779541},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C2778311575","wikidata":"https://www.wikidata.org/wiki/Q18534","display_name":"Metaphor","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C2777617010","wikidata":"https://www.wikidata.org/wiki/Q18957","display_name":"Mainstream","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2533999979496002},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.2522999942302704},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.13737","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13737","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.13737","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13737","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.6592689752578735}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"When":[0],"an":[1,184],"omnimodal":[2,29,101,212],"large":[3],"language":[4],"model":[5],"accepts":[6],"a":[7,44,52,66,74,109,188],"question":[8],"whose":[9],"textual":[10,53],"premise":[11,83,150],"contradicts":[12],"what":[13],"it":[14],"actually":[15],"sees":[16],"or":[17,25],"hears,":[18],"does":[19],"the":[20,58,120,126,148,194,209],"failure":[21,137],"lie":[22],"in":[23,26,129,140,155,215],"perception":[24],"action?":[27],"Recent":[28],"models":[30,122,133],"are":[31],"positioned":[32],"as":[33,146],"perception-grounded":[34],"agents":[35],"that":[36,55],"jointly":[37],"process":[38],"video,":[39],"audio,":[40],"and":[41,82,103,153,178,200],"text,":[42],"yet":[43],"basic":[45],"form":[46],"of":[47,70],"grounding":[48,175,213],"remains":[49],"untested:":[50],"catching":[51],"claim":[54,128],"conflicts":[56],"with":[57,73],"model's":[59],"own":[60],"sensory":[61],"input.":[62],"We":[63],"introduce":[64],"IMAVB,":[65],"curated":[67],"500-clip":[68],"benchmark":[69],"long-form":[71],"movies":[72],"2x2":[75],"design":[76],"crossing":[77],"target":[78],"modality":[79],"(vision,":[80],"audio)":[81],"condition":[84],"(standard,":[85],"misleading),":[86],"which":[87,141,156],"lets":[88],"us":[89],"measure":[90],"conflict":[91],"detection":[92],"separately":[93],"from":[94],"ordinary":[95,167],"multimodal":[96],"comprehension.":[97],"Across":[98],"eight":[99],"open-source":[100],"LLMs":[102],"Gemini":[104],"3.1":[105],"Pro,":[106],"we":[107],"document":[108],"Representation-Action":[110],"Gap:":[111],"hidden":[112],"states":[113],"reliably":[114],"encode":[115],"premise-perception":[116],"mismatches":[117],"even":[118],"when":[119],"same":[121],"almost":[123],"never":[124],"reject":[125,158,163],"false":[127,149],"their":[130],"outputs.":[131],"Behaviorally,":[132],"fall":[134],"into":[135,198],"two":[136],"modes:":[138],"under-rejection,":[139],"they":[142,157],"answer":[143],"misleading":[144],"questions":[145],"if":[147],"were":[151],"true;":[152],"over-rejection,":[154],"more":[159],"often":[160],"but":[161],"also":[162],"standard":[164],"questions,":[165],"sacrificing":[166],"comprehension":[168],"accuracy.":[169],"The":[170],"gap":[171],"is":[172],"modality-asymmetric":[173],"(audio":[174],"underperforms":[176],"vision)":[177],"prompt-resistant":[179],"across":[180],"seven":[181],"variants.":[182],"As":[183],"initial":[185],"diagnostic":[186],"intervention,":[187],"probe-guided":[189],"logit":[190],"adjustment":[191],"(PGLA)":[192],"re-injects":[193],"encoded":[195],"mismatch":[196],"signal":[197],"decoding":[199],"consistently":[201],"improves":[202],"rejection":[203],"behavior.":[204],"Together,":[205],"these":[206],"results":[207],"suggest":[208],"bottleneck":[210],"for":[211],"lies":[214],"translation,":[216],"not":[217],"perception.":[218]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-15T00:00:00"}
