{"id":"https://openalex.org/W7155052370","doi":"https://doi.org/10.48550/arxiv.2604.18572","title":"Back into Plato's Cave: Examining Cross-modal Representational Convergence at Scale","display_name":"Back into Plato's Cave: Examining Cross-modal Representational Convergence at Scale","publication_year":2026,"publication_date":"2026-04-20","ids":{"openalex":"https://openalex.org/W7155052370","doi":"https://doi.org/10.48550/arxiv.2604.18572"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.18572","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18572","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.18572","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134163397","display_name":"A. Sophia Koepke","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Koepke, A. Sophia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120003873","display_name":"Daniil Zverev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zverev, Daniil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009491631","display_name":"Shiry Ginosar","orcid":"https://orcid.org/0000-0002-7362-1401"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ginosar, Shiry","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5077436220","display_name":"Alexei A. Efros","orcid":"https://orcid.org/0000-0001-5720-8070"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Efros, Alexei A.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5134163397"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.2020999938249588,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.2020999938249588,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.11860000342130661,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.06809999793767929,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.682200014591217},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6765000224113464},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.66839998960495},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.6653000116348267},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.46779999136924744},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.412200003862381},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.3928000032901764},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.39079999923706055}],"concepts":[{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.682200014591217},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6765000224113464},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.66839998960495},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.6653000116348267},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6029999852180481},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5746999979019165},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.46779999136924744},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.45590001344680786},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.412200003862381},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3928000032901764},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.39079999923706055},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.36899998784065247},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3659999966621399},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.35830000042915344},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3578999936580658},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2822999954223633},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.28189998865127563},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2621999979019165},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C71889745","wikidata":"https://www.wikidata.org/wiki/Q1783264","display_name":"Counterfactual conditional","level":3,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.18572","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18572","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.18572","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18572","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.40686488151550293,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,81,93],"Platonic":[1],"Representation":[2],"Hypothesis":[3],"suggests":[4],"that":[5,41,95,125,140,164],"neural":[6],"networks":[7],"trained":[8,184],"on":[9,53,64,185],"different":[10,186],"modalities":[11,187],"(e.g.,":[12],"text":[13],"and":[14,17,50,69,90,132],"images)":[15],"align":[16],"eventually":[18],"converge":[19],"toward":[20],"the":[21,42,54,73,110,141,165,194,198],"same":[22,82,199],"representation":[23],"of":[24,79,144,193],"reality.":[25],"If":[26],"true,":[27],"this":[28,46],"has":[29],"significant":[30],"implications":[31],"for":[32,45,88,157,168],"whether":[33],"modality":[34],"choice":[35],"matters":[36],"at":[37],"all.":[38],"We":[39,137],"show":[40],"experimental":[43],"evidence":[44,167],"hypothesis":[47],"is":[48,58,75,84,172],"fragile":[49],"depends":[51],"critically":[52],"evaluation":[55],"regime.":[56],"Alignment":[57],"measured":[59,135],"using":[60],"mutual":[61],"nearest":[62],"neighbors":[63],"small":[65],"datasets":[66],"($\\approx$1K":[67],"samples)":[68],"degrades":[70],"substantially":[71],"as":[72],"dataset":[74],"scaled":[76],"to":[77,155,181],"millions":[78],"samples.":[80],"behavior":[83],"observed":[85],"beyond":[86],"text-image,":[87],"text-audio":[89],"text-video":[91],"alignment.":[92,136],"alignment":[94],"remains":[96],"between":[97],"model":[98],"representations":[99,192],"reflects":[100],"coarse":[101],"semantic":[102],"overlap":[103],"rather":[104],"than":[105,175],"consistent":[106],"fine-grained":[107],"structure.":[108],"Moreover,":[109],"evaluations":[111],"in":[112,118,128],"Huh":[113],"et":[114],"al.":[115],"are":[116],"done":[117],"a":[119,123],"one-to-one":[120],"image-caption":[121],"setting,":[122],"constraint":[124],"breaks":[126],"down":[127],"realistic":[129],"many-to-many":[130],"settings":[131],"further":[133],"reduces":[134],"also":[138],"find":[139],"reported":[142],"trend":[143],"stronger":[145],"language":[146],"models":[147],"increasingly":[148],"aligning":[149],"with":[150],"vision":[151],"does":[152],"not":[153,197],"appear":[154],"hold":[156],"newer":[158],"models.":[159],"Overall,":[160],"our":[161],"findings":[162],"suggest":[163],"current":[166],"cross-modal":[169],"representational":[170],"convergence":[171],"considerably":[173],"weaker":[174],"subsequent":[176],"works":[177],"have":[178],"taken":[179],"it":[180],"be.":[182],"Models":[183],"may":[188],"learn":[189],"equally":[190],"rich":[191],"world,":[195],"just":[196],"one.":[200]},"counts_by_year":[],"updated_date":"2026-06-04T06:14:18.571774","created_date":"2026-04-22T00:00:00"}
