{"id":"https://openalex.org/W7143468641","doi":"https://doi.org/10.48550/arxiv.2603.26589","title":"The Limits of Learning from Pictures and Text: Vision-Language Models and Embodied Scene Understanding","display_name":"The Limits of Learning from Pictures and Text: Vision-Language Models and Embodied Scene Understanding","publication_year":2026,"publication_date":"2026-03-27","ids":{"openalex":"https://openalex.org/W7143468641","doi":"https://doi.org/10.48550/arxiv.2603.26589"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.26589","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26589","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.26589","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111495684","display_name":"Gillian Rosenberg","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rosenberg, Gillian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119001097","display_name":"Skylar Stadhard","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stadhard, Skylar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130922413","display_name":"Bruce C. Hansen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hansen, Bruce C.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5030173399","display_name":"Michelle R. Greene","orcid":"https://orcid.org/0000-0002-0597-4715"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Greene, Michelle R.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5111495684"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8287000060081482,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8287000060081482,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11431","display_name":"Action Observation and Synchronization","score":0.043299999088048935,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.018300000578165054,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.7748000025749207},{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.7494000196456909},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.6793000102043152},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.517300009727478},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.489300012588501},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4860999882221222},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4855000078678131},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.36309999227523804},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.35199999809265137}],"concepts":[{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.7748000025749207},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.7494000196456909},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.6793000102043152},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5622000098228455},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.517300009727478},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5097000002861023},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.489300012588501},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4860999882221222},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4855000078678131},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4154999852180481},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3653999865055084},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.35199999809265137},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3508000075817108},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3386000096797943},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.31679999828338623},{"id":"https://openalex.org/C2778662690","wikidata":"https://www.wikidata.org/wiki/Q3125339","display_name":"Spatial ability","level":3,"score":0.313400000333786},{"id":"https://openalex.org/C207551092","wikidata":"https://www.wikidata.org/wiki/Q508969","display_name":"Cognitive linguistics","level":3,"score":0.303600013256073},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.303600013256073},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.29919999837875366},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.298799991607666},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C2982736386","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Statistical learning","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2800000011920929},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26820001006126404},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.2556999921798706},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.25200000405311584},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.25200000405311584},{"id":"https://openalex.org/C94966114","wikidata":"https://www.wikidata.org/wiki/Q29256","display_name":"Black box","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.26589","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26589","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.26589","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26589","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7947246432304382,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"What":[0],"information":[1],"is":[2,218],"sufficient":[3],"to":[4,65,109],"learn":[5],"the":[6,18,26,52,110,165,234],"full":[7],"richness":[8],"of":[9,21,51,67,112,197,228,236],"human":[10,70,113,229],"scene":[11,75,222],"understanding?":[12],"The":[13],"distributional":[14,53,212],"hypothesis":[15],"holds":[16],"that":[17,104,138,164,184,211,225,240],"statistical":[19],"co-occurrence":[20],"language":[22],"and":[23,86,142,172,216],"images":[24,215],"captures":[25],"conceptual":[27],"knowledge":[28,128,200],"underlying":[29],"visual":[30,230],"cognition.":[31],"Vision-language":[32],"models":[33],"(VLMs)":[34],"are":[35],"trained":[36],"on":[37,126],"massive":[38],"paired":[39],"text-image":[40],"corpora":[41],"but":[42,130],"lack":[43,92],"embodied":[44,199],"experience,":[45],"making":[46],"them":[47],"an":[48],"ideal":[49],"test":[50],"hypothesis.":[54],"We":[55],"report":[56],"two":[57],"experiments":[58],"comparing":[59],"descriptions":[60],"generated":[61],"by":[62,116,176],"18":[63],"VLMs":[64,122],"those":[66],"over":[68],"2000":[69],"observers":[71],"across":[72],"15":[73],"high-level":[74],"understanding":[76],"tasks,":[77,129],"spanning":[78],"general":[79,127],"knowledge,":[80],"affordances,":[81],"sensory":[82],"experiences,":[83],"affective":[84],"responses,":[85,114],"future":[87],"prediction.":[88],"Because":[89],"many":[90],"tasks":[91,137],"ground":[93],"truth":[94],"answers,":[95],"we":[96,153],"developed":[97],"a":[98,132],"Human-Calibrated":[99],"Cosine":[100],"Distance":[101],"(HCD)":[102],"metric":[103],"measures":[105],"VLM":[106],"output":[107],"similarity":[108],"distribution":[111],"scaled":[115],"within-human":[117],"variability.":[118],"In":[119,150],"Experiment":[120,151],"1,":[121],"approached":[123],"human-level":[124],"performance":[125],"showed":[131],"robust":[133],"deficit":[134,166],"for":[135,158,220],"affordance":[136,161,191],"resisted":[139],"prompt":[140],"engineering":[141],"did":[143],"not":[144,174],"improve":[145],"with":[146,194],"newer":[147],"model":[148],"releases.":[149],"2,":[152],"tested":[154],"six":[155],"mechanistic":[156],"hypotheses":[157],"explaining":[159],"this":[160],"gap,":[162],"finding":[163],"was":[167,173],"structural":[168],"rather":[169],"than":[170],"stylistic":[171],"resolved":[175],"providing":[177],"explicit":[178],"spatial":[179],"information.":[180],"Corpus":[181],"analyses":[182],"revealed":[183],"image":[185],"captioning":[186],"datasets":[187],"contain":[188],"sparse":[189],"agent-addressed":[190],"language,":[192],"consistent":[193],"Gricean":[195],"accounts":[196],"why":[198],"may":[201,232],"be":[202],"systematically":[203],"underrepresented":[204],"in":[205],"language.":[206],"Together,":[207],"these":[208],"findings":[209],"suggest":[210],"learning":[213],"from":[214],"text":[217],"insufficient":[219],"affordance-based":[221],"understanding,":[223],"implying":[224],"some":[226],"dimensions":[227],"cognition":[231],"require":[233],"kind":[235],"agent-centered,":[237],"three-dimensional":[238],"experience":[239],"no":[241],"photograph":[242],"or":[243],"caption":[244],"can":[245],"encode.":[246]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-31T00:00:00"}
