{"id":"https://openalex.org/W7154612902","doi":"https://doi.org/10.48550/arxiv.2604.13082","title":"The Long Delay to Arithmetic Generalization: When Learned Representations Outrun Behavior","display_name":"The Long Delay to Arithmetic Generalization: When Learned Representations Outrun Behavior","publication_year":2026,"publication_date":"2026-03-30","ids":{"openalex":"https://openalex.org/W7154612902","doi":"https://doi.org/10.48550/arxiv.2604.13082"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.13082","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13082","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.13082","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133767259","display_name":"Laura Gomezjurado Gonzalez","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gonzalez, Laura Gomezjurado","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5133767259"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13720","display_name":"Benford\u2019s Law and Fraud Detection","score":0.9535999894142151,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13720","display_name":"Benford\u2019s Law and Fraud Detection","score":0.9535999894142151,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.008799999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/numeral-system","display_name":"Numeral system","score":0.5823000073432922},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.5055000185966492},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5038999915122986},{"id":"https://openalex.org/keywords/base","display_name":"Base (topology)","score":0.3864000141620636},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.3628999888896942},{"id":"https://openalex.org/keywords/factorization","display_name":"Factorization","score":0.33149999380111694},{"id":"https://openalex.org/keywords/modulo","display_name":"Modulo","score":0.2883000075817108}],"concepts":[{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.7365999817848206},{"id":"https://openalex.org/C204160518","wikidata":"https://www.wikidata.org/wiki/Q122653","display_name":"Numeral system","level":2,"score":0.5823000073432922},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5353000164031982},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.5055000185966492},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5038999915122986},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.45080000162124634},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.3864000141620636},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.375},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.3628999888896942},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35760000348091125},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33329999446868896},{"id":"https://openalex.org/C187834632","wikidata":"https://www.wikidata.org/wiki/Q188804","display_name":"Factorization","level":2,"score":0.33149999380111694},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.30070000886917114},{"id":"https://openalex.org/C54732982","wikidata":"https://www.wikidata.org/wiki/Q1415345","display_name":"Modulo","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.27810001373291016},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.275299996137619},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.27390000224113464},{"id":"https://openalex.org/C55526617","wikidata":"https://www.wikidata.org/wiki/Q719375","display_name":"Operand","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C2777151079","wikidata":"https://www.wikidata.org/wiki/Q141160","display_name":"Parity (physics)","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C190290938","wikidata":"https://www.wikidata.org/wiki/Q387015","display_name":"Trie","level":3,"score":0.25769999623298645},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.13082","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13082","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.13082","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13082","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7178103923797607,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Grokking":[0],"in":[1,51,202],"transformers":[2],"trained":[3,97,111],"on":[4,147],"algorithmic":[5],"tasks":[6],"is":[7],"characterized":[8],"by":[9,105],"a":[10,96,100,110,116],"long":[11],"delay":[12,24,36],"between":[13],"training-set":[14],"fit":[15],"and":[16,60,67,119,128,176],"abrupt":[17],"generalization,":[18],"but":[19],"the":[20,52,63,71,91,122,125,140,158,195,205],"source":[21],"of":[22,85,181],"that":[23,34,49,62,188],"remains":[25,80],"poorly":[26],"understood.":[27],"In":[28],"encoder-decoder":[29],"arithmetic":[30,161],"models,":[31],"we":[32],"argue":[33],"this":[35],"reflects":[37],"limited":[38],"access":[39],"to":[40,47,133],"already":[41],"learned":[42],"structure":[43,50,69,194],"rather":[44],"than":[45],"failure":[46],"acquire":[48],"first":[53,72],"place.":[54],"We":[55],"study":[56],"one-step":[57],"Collatz":[58,159],"prediction":[59],"find":[61],"encoder":[64,98,118],"organizes":[65],"parity":[66],"residue":[68],"within":[70],"few":[73],"thousand":[74],"training":[75],"steps,":[76],"while":[77,108,168],"output":[78],"accuracy":[79],"near":[81],"chance":[82],"for":[83,135],"tens":[84],"thousands":[86],"more.":[87],"Causal":[88],"interventions":[89],"support":[90],"decoder":[92,112,123,196],"bottleneck":[93],"hypothesis.":[94],"Transplanting":[95],"into":[99],"fresh":[101],"model":[102],"accelerates":[103],"grokking":[104],"2.75":[106],"times,":[107],"transplanting":[109],"actively":[113],"hurts.":[114],"Freezing":[115],"converged":[117],"retraining":[120],"only":[121],"eliminates":[124],"plateau":[126],"entirely":[127],"yields":[129],"97.6%":[130],"accuracy,":[131,167],"compared":[132],"86.1%":[134],"joint":[136],"training.":[137],"What":[138],"makes":[139],"decoder's":[141],"job":[142],"harder":[143],"or":[144],"easier":[145],"depends":[146],"numeral":[148],"representation.":[149],"Across":[150],"15":[151],"bases,":[152],"those":[153],"whose":[154],"factorization":[155],"aligns":[156],"with":[157],"map's":[160],"(e.g.,":[162],"base":[163,182],"24)":[164],"reach":[165],"99.8%":[166],"binary":[169],"fails":[170],"completely":[171],"because":[172],"its":[173],"representations":[174],"collapse":[175],"never":[177],"recover.":[178],"The":[179],"choice":[180],"acts":[183],"as":[184],"an":[185],"inductive":[186],"bias":[187],"controls":[189],"how":[190],"much":[191],"local":[192],"digit":[193],"can":[197],"exploit,":[198],"producing":[199],"large":[200],"differences":[201],"learnability":[203],"from":[204],"same":[206],"underlying":[207],"task.":[208]},"counts_by_year":[],"updated_date":"2026-04-17T06:04:52.305304","created_date":"2026-04-17T00:00:00"}
