{"id":"https://openalex.org/W7128818591","doi":"https://doi.org/10.48550/arxiv.2602.12204","title":"Learning to Forget Attention: Memory Consolidation for Adaptive Compute Reduction","display_name":"Learning to Forget Attention: Memory Consolidation for Adaptive Compute Reduction","publication_year":2026,"publication_date":"2026-02-12","ids":{"openalex":"https://openalex.org/W7128818591","doi":"https://doi.org/10.48550/arxiv.2602.12204"},"language":"en","primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.12204","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125935149","display_name":"Ibne Farabi Shihab","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shihab, Ibne Farabi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119832027","display_name":"Sanjeda Akter","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Akter, Sanjeda","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125918016","display_name":"Anuj Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharma, Anuj","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5125935149"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.13429999351501465,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.13429999351501465,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.11840000003576279,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10429","display_name":"EEG and Brain-Computer Interfaces","score":0.1145000010728836,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.6157000064849854},{"id":"https://openalex.org/keywords/episodic-memory","display_name":"Episodic memory","score":0.4442000091075897},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.41510000824928284},{"id":"https://openalex.org/keywords/consolidation","display_name":"Consolidation (business)","score":0.4009000062942505},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.3873000144958496},{"id":"https://openalex.org/keywords/memory-model","display_name":"Memory model","score":0.3643999993801117},{"id":"https://openalex.org/keywords/memory-consolidation","display_name":"Memory consolidation","score":0.3578000068664551},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.3522999882698059}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6797999739646912},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.6157000064849854},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5170000195503235},{"id":"https://openalex.org/C88576662","wikidata":"https://www.wikidata.org/wiki/Q18646","display_name":"Episodic memory","level":3,"score":0.4442000091075897},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.41510000824928284},{"id":"https://openalex.org/C2776014549","wikidata":"https://www.wikidata.org/wiki/Q3050847","display_name":"Consolidation (business)","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3873000144958496},{"id":"https://openalex.org/C12186640","wikidata":"https://www.wikidata.org/wiki/Q6815743","display_name":"Memory model","level":3,"score":0.3643999993801117},{"id":"https://openalex.org/C48455012","wikidata":"https://www.wikidata.org/wiki/Q2892593","display_name":"Memory consolidation","level":3,"score":0.3578000068664551},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.3522999882698059},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.34619998931884766},{"id":"https://openalex.org/C2985957978","wikidata":"https://www.wikidata.org/wiki/Q492","display_name":"Human memory","level":3,"score":0.34279999136924744},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33739998936653137},{"id":"https://openalex.org/C118702147","wikidata":"https://www.wikidata.org/wiki/Q189396","display_name":"Dynamic random-access memory","level":3,"score":0.3255999982357025},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26249998807907104},{"id":"https://openalex.org/C87868495","wikidata":"https://www.wikidata.org/wiki/Q750843","display_name":"Information processing","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.25690001249313354},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2567000091075897},{"id":"https://openalex.org/C21963081","wikidata":"https://www.wikidata.org/wiki/Q11337567","display_name":"Working memory","level":3,"score":0.2524000108242035}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.12204","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"pmh:oai:dr.lib.iastate.edu:20.500.12876/YvkA9d6z","is_oa":false,"landing_page_url":"https://dr.lib.iastate.edu/handle/20.500.12876/YvkA9d6z","pdf_url":null,"source":{"id":"https://openalex.org/S4377196104","display_name":"Iowa State University Digital Repository (Iowa State University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I173911158","host_organization_name":"Iowa State University","host_organization_lineage":["https://openalex.org/I173911158"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"https://doi.org/10.48550/arXiv.2602.12204","raw_type":"Preprint"},{"id":"doi:10.48550/arxiv.2602.12204","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.12204","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.12204","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Hybrid":[0],"architectures":[1],"combining":[2],"state-space":[3],"models":[4],"with":[5,142,173],"attention":[6,17,51,100,105,139,160,175],"have":[7],"achieved":[8],"strong":[9],"efficiency-quality":[10],"tradeoffs,":[11],"yet":[12],"existing":[13],"approaches":[14],"either":[15],"apply":[16],"uniformly":[18],"or":[19],"learn":[20],"static":[21,132],"sparse":[22,99],"patterns.":[23],"This":[24],"misses":[25],"a":[26,42,82,110,114],"key":[27],"opportunity:":[28],"\\emph{attention":[29],"demand":[30],"should":[31],"decrease":[32,67],"over":[33,107],"time":[34],"as":[35],"recurring":[36,143],"patterns":[37,144,168],"become":[38],"familiar}.":[39],"We":[40,122],"present":[41],"surprising":[43],"finding":[44],"from":[45,57,191],"analyzing":[46],"GPT-2":[47],"models:":[48],"\\textbf{88\\%}":[49],"of":[50,145],"operations":[52],"retrieve":[53],"information":[54],"already":[55],"predictable":[56],"the":[58,180],"model's":[59],"hidden":[60],"state,":[61],"and":[62,166,201],"this":[63,72,125],"redundancy":[64],"does":[65],"\\emph{not}":[66],"during":[68],"training.":[69],"Motivated":[70],"by":[71],"observation,":[73],"we":[74],"introduce":[75],"\\textbf{\\ours{}}":[76],"(\\textbf{C}onsolidation-based":[77],"\\textbf{R}outing":[78],"for":[79,140,164],"\\textbf{A}daptive":[80],"\\textbf{M}emory),":[81],"biologically":[83],"inspired":[84],"memory":[85,188],"consolidation":[86,182],"mechanism":[87],"that":[88,124],"gradually":[89],"distills":[90],"episodic":[91],"retrievals":[92],"into":[93],"parametric":[94],"semantic":[95],"memory.":[96],"Unlike":[97],"prior":[98],"methods,":[101],"\\ours{}":[102,153],"exhibits":[103],"\\emph{decreasing":[104],"utilization}":[106],"training,":[108],"achieving":[109],"\\textbf{37.8$\\times$}":[111],"reduction":[112,176],"through":[113],"sharp":[115],"phase":[116],"transition":[117,189],"at":[118,158,205],"approximately":[119],"3K":[120],"steps.":[121],"prove":[123],"capability":[126],"is":[127],"\\emph{impossible}":[128],"without":[129,177],"consolidation:":[130],"any":[131],"routing":[133],"scheme":[134],"requires":[135],"$\u03a9(f":[136],"\\cdot":[137],"n)$":[138],"tasks":[141,172],"frequency":[146],"$f$.":[147],"On":[148],"our":[149],"proposed":[150],"SRCD":[151],"benchmark,":[152],"achieves":[154],"\\textbf{100\\%":[155],"retrieval":[156],"accuracy}":[157],"1.6\\%":[159],"compute":[161],"(vs.\\":[162],"68\\%":[163],"baselines),":[165],"consolidated":[167],"transfer":[169],"to":[170],"unseen":[171],"\\textbf{48--52\\%}":[174],"retraining.":[178],"Remarkably,":[179],"learned":[181],"dynamics":[183],"quantitatively":[184],"match":[185],"human":[186],"episodic-to-semantic":[187],"curves":[190],"cognitive":[192],"psychology":[193],"($\u03b3=":[194],"0.43$":[195],"vs.\\":[196],"$\u03b3_{\\text{human}}":[197],"\\approx":[198],"0.4$--$0.5$).":[199],"Code":[200],"benchmarks":[202],"are":[203],"available":[204],"[anonymized].":[206]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-14T00:00:00"}
