{"id":"https://openalex.org/W7140201664","doi":"https://doi.org/10.48550/arxiv.2603.22278","title":"The Dual Mechanisms of Spatial Variable Binding in Vision-Language Models","display_name":"The Dual Mechanisms of Spatial Variable Binding in Vision-Language Models","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140201664","doi":"https://doi.org/10.48550/arxiv.2603.22278"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22278","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22278","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22278","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Cui, Kelly","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Kelly","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Prakash, Nikhil","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prakash, Nikhil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Messica, Shoval","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Messica, Shoval","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Raina, Ayush","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raina, Ayush","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bau, David","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bau, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Torralba, Antonio","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Torralba, Antonio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Shaham, Tamar Rott","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shaham, Tamar Rott","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9886999726295471,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9886999726295471,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.0034000000450760126,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0006000000284984708,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.729200005531311},{"id":"https://openalex.org/keywords/spatial-relation","display_name":"Spatial relation","score":0.6736999750137329},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6090999841690063},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.553600013256073},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.5378999710083008},{"id":"https://openalex.org/keywords/association","display_name":"Association (psychology)","score":0.517300009727478},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.47440001368522644},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4383000135421753},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4287000000476837},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.42489999532699585}],"concepts":[{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.729200005531311},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7070000171661377},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.6736999750137329},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6090999841690063},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.572700023651123},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.553600013256073},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.5378999710083008},{"id":"https://openalex.org/C142853389","wikidata":"https://www.wikidata.org/wiki/Q744778","display_name":"Association (psychology)","level":2,"score":0.517300009727478},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.47440001368522644},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4383000135421753},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4287000000476837},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.42489999532699585},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.4023999869823456},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.40209999680519104},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.39579999446868896},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.374099999666214},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37389999628067017},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.31929999589920044},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28439998626708984},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.28279998898506165},{"id":"https://openalex.org/C153938966","wikidata":"https://www.wikidata.org/wiki/Q3348148","display_name":"Object-based spatial database","level":4,"score":0.2827000021934509},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.26649999618530273},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.26649999618530273},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.25929999351501465},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2549999952316284},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22278","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22278","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22278","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22278","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.40299171209335327,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Many":[0],"multimodal":[1],"tasks,":[2],"such":[3,31],"as":[4],"image":[5,142],"captioning":[6],"and":[7,21,29,104,174],"visual":[8,68,121],"question":[9],"answering,":[10],"require":[11],"vision-language":[12],"models":[13,150],"(VLMs)":[14],"to":[15,49,71],"bind":[16],"objects":[17,103],"with":[18],"their":[19],"properties":[20],"spatial":[22,51,63,90,115,137,145,167],"relations.":[23],"Yet":[24],"it":[25],"remains":[26],"unclear":[27],"where":[28],"how":[30,166],"associations":[32],"are":[33,105],"computed":[34,171],"within":[35,172],"VLMs.":[36],"In":[37,54],"this":[38,74,114],"work,":[39],"we":[40],"show":[41,132],"that":[42,133],"VLMs":[43,173],"rely":[44],"on":[45,65,154],"two":[46],"concurrent":[47],"mechanisms":[48],"represent":[50,61],"variable":[52,146,168],"binding.":[53],"the":[55,86,94,100,109,159,176],"language":[56,110],"model":[57,83,111],"backbone,":[58],"intermediate":[59],"layers":[60],"content-independent":[62],"relations":[64],"top":[66],"of":[67,89,102,151,179],"tokens":[69,143],"corresponding":[70],"objects.":[72],"However,":[73],"mechanism":[75],"plays":[76],"only":[77],"a":[78],"secondary":[79],"role":[80,178],"in":[81,93,182],"shaping":[82],"predictions.":[84],"Instead,":[85],"dominant":[87],"source":[88],"information":[91],"originates":[92],"vision":[95,180],"encoder,":[96],"whose":[97],"representations":[98,138],"encode":[99],"layout":[101],"directly":[106],"exploited":[107],"by":[108],"backbone.":[112],"Notably,":[113],"signal":[116],"is":[117,170],"distributed":[118],"globally":[119,139],"across":[120,140,149],"tokens,":[122],"extending":[123],"beyond":[124],"object":[125],"regions":[126],"into":[127],"surrounding":[128],"background":[129],"areas.":[130],"We":[131],"enhancing":[134],"these":[135],"vision-derived":[136],"all":[141],"improves":[144],"binding":[147,169],"performance":[148],"various":[152],"sizes":[153],"complex":[155],"natural":[156],"images":[157],"from":[158],"COCO":[160],"datasets.":[161],"Together,":[162],"our":[163],"results":[164],"clarify":[165],"highlight":[175],"central":[177],"encoders":[181],"enabling":[183],"it.":[184]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-25T00:00:00"}
