{"id":"https://openalex.org/W7152093627","doi":"https://doi.org/10.48550/arxiv.2604.05090","title":"Multilingual Language Models Encode Script Over Linguistic Structure","display_name":"Multilingual Language Models Encode Script Over Linguistic Structure","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7152093627","doi":"https://doi.org/10.48550/arxiv.2604.05090"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.05090","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.05090","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.05090","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133204861","display_name":"Aastha A K Verma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Verma, Aastha A K","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104707682","display_name":"Anwoy Chatterjee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chatterjee, Anwoy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133182248","display_name":"Mehak Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Mehak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133221019","display_name":"Tanmoy Chakraborty","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chakraborty, Tanmoy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.41130000352859497,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.41130000352859497,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.2939000129699707,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.028300000354647636,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.5202000141143799},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.45660001039505005},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.4318999946117401},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.3312999904155731},{"id":"https://openalex.org/keywords/invariant","display_name":"Invariant (physics)","score":0.3240000009536743},{"id":"https://openalex.org/keywords/constructed-language","display_name":"Constructed language","score":0.3163999915122986},{"id":"https://openalex.org/keywords/theoretical-linguistics","display_name":"Theoretical linguistics","score":0.29760000109672546}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6133999824523926},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5990999937057495},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.5594000220298767},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.5202000141143799},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.506600022315979},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.45660001039505005},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.4318999946117401},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.3312999904155731},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C94922259","wikidata":"https://www.wikidata.org/wiki/Q33215","display_name":"Constructed language","level":2,"score":0.3163999915122986},{"id":"https://openalex.org/C153578388","wikidata":"https://www.wikidata.org/wiki/Q351625","display_name":"Theoretical linguistics","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C167927819","wikidata":"https://www.wikidata.org/wiki/Q1930567","display_name":"Shuffling","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C2987567764","wikidata":"https://www.wikidata.org/wiki/Q125421","display_name":"Second language","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C9679016","wikidata":"https://www.wikidata.org/wiki/Q1417473","display_name":"Principle of maximum entropy","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.25839999318122864},{"id":"https://openalex.org/C172205122","wikidata":"https://www.wikidata.org/wiki/Q777864","display_name":"Linguistic description","level":2,"score":0.25060001015663147},{"id":"https://openalex.org/C129792486","wikidata":"https://www.wikidata.org/wiki/Q1050419","display_name":"Language identification","level":3,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.05090","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.05090","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.05090","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.05090","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7000312805175781,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multilingual":[0],"language":[1,36],"models":[2],"(LMs)":[3],"organize":[4,148],"representations":[5,86,149],"for":[6],"typologically":[7],"and":[8,56,66],"orthographically":[9],"diverse":[10],"languages":[11],"into":[12,160],"a":[13,161],"shared":[14],"parameter":[15],"space,":[16],"yet":[17],"the":[18,59],"nature":[19],"of":[20],"this":[21,27],"internal":[22],"organization":[23],"remains":[24],"elusive.":[25],"In":[26],"work,":[28],"we":[29,48],"investigate":[30],"which":[31],"linguistic":[32,154],"properties":[33],"-":[34,41],"abstract":[35],"identity":[37],"or":[38],"surface-form":[39,130],"cues":[40],"shape":[42],"multilingual":[43,146],"representations.":[44],"To":[45],"do":[46],"so,":[47],"analyze":[49],"language-associated":[50],"units":[51,77,125,135],"across":[52],"different":[53],"model":[54],"families":[55],"scales":[57],"using":[58],"Language":[60],"Activation":[61],"Probability":[62],"Entropy":[63],"(LAPE)":[64],"metric,":[65],"further":[67],"decompose":[68],"activations":[69],"with":[70,89,153],"Sparse":[71],"Autoencoders.":[72],"We":[73],"find":[74],"that":[75,87,106,119,126,145],"these":[76],"are":[78,127],"strongly":[79],"conditioned":[80],"on":[81,101],"orthography:":[82],"romanization":[83],"induces":[84],"near-disjoint":[85],"align":[88],"neither":[90],"native-script":[91],"inputs":[92],"nor":[93],"English,":[94],"while":[95,115],"word-order":[96],"shuffling":[97],"has":[98],"limited":[99],"effect":[100],"unit":[102],"identity.":[103],"Probing":[104],"shows":[105],"typological":[107,138],"structure":[108],"becomes":[109],"increasingly":[110],"accessible":[111],"in":[112],"deeper":[113],"layers,":[114],"causal":[116],"interventions":[117],"indicate":[118],"generation":[120],"is":[121],"most":[122],"sensitive":[123],"to":[124,129,134],"invariant":[128],"perturbations":[131],"rather":[132],"than":[133],"identified":[136],"by":[137],"alignment":[139],"alone.":[140],"Overall,":[141],"our":[142],"results":[143],"suggest":[144],"LMs":[147],"around":[150],"surface":[151],"form,":[152],"abstraction":[155],"emerging":[156],"gradually":[157],"without":[158],"collapsing":[159],"unified":[162],"interlingua.":[163]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-09T00:00:00"}
