{"id":"https://openalex.org/W7148313504","doi":"https://doi.org/10.48550/arxiv.2604.00004","title":"LinearARD: Linear-Memory Attention Distillation for RoPE Restoration","display_name":"LinearARD: Linear-Memory Attention Distillation for RoPE Restoration","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7148313504","doi":"https://doi.org/10.48550/arxiv.2604.00004"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00004","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00004","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132828514","display_name":"Ning Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yang, Ning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128759245","display_name":"Hengyu Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Hengyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132827315","display_name":"Wentao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wentao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132750135","display_name":"Baoliang Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Baoliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132818423","display_name":"Haijun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Haijun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132822655","display_name":"Jun Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5132828514"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3418000042438507,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3418000042438507,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.2711000144481659,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.11299999803304672,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5975000262260437},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5491999983787537},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.5254999995231628},{"id":"https://openalex.org/keywords/rope","display_name":"Rope","score":0.47940000891685486},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.46939998865127563},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4490000009536743},{"id":"https://openalex.org/keywords/quadratic-equation","display_name":"Quadratic equation","score":0.44339999556541443},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.44130000472068787},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.42730000615119934},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.41029998660087585}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7192999720573425},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5975000262260437},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5491999983787537},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.5254999995231628},{"id":"https://openalex.org/C162269090","wikidata":"https://www.wikidata.org/wiki/Q1156047","display_name":"Rope","level":2,"score":0.47940000891685486},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.46939998865127563},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4490000009536743},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.44339999556541443},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.44130000472068787},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.42730000615119934},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.41029998660087585},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4083999991416931},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.40709999203681946},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.40380001068115234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39100000262260437},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3846000134944916},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3837999999523163},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C19743564","wikidata":"https://www.wikidata.org/wiki/Q25378119","display_name":"Flicker","level":2,"score":0.3052000105381012},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3037000000476837},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.303600013256073},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C71134354","wikidata":"https://www.wikidata.org/wiki/Q458825","display_name":"Kernel density estimation","level":3,"score":0.29269999265670776},{"id":"https://openalex.org/C171937826","wikidata":"https://www.wikidata.org/wiki/Q48378","display_name":"Pulley","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C43711488","wikidata":"https://www.wikidata.org/wiki/Q7534783","display_name":"Skew","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C111696304","wikidata":"https://www.wikidata.org/wiki/Q2303697","display_name":"Sorting","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2648000121116638},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.25279998779296875},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2524999976158142},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00004","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00004","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"extension":[1],"of":[2,75,94,136,140],"context":[3],"windows":[4],"in":[5],"Large":[6],"Language":[7],"Models":[8],"is":[9,172],"typically":[10],"facilitated":[11],"by":[12,17,166],"scaling":[13],"positional":[14],"encodings":[15],"followed":[16],"lightweight":[18],"Continual":[19],"Pre-Training":[20],"(CPT).":[21],"While":[22],"effective":[23],"for":[24],"processing":[25],"long":[26],"sequences,":[27],"this":[28],"paradigm":[29],"often":[30],"disrupts":[31],"original":[32],"model":[33],"capabilities,":[34],"leading":[35],"to":[36,83,119,131,161],"performance":[37,139],"degradation":[38],"on":[39,146],"standard":[40],"short-text":[41,138],"benchmarks.":[42,148],"We":[43],"propose":[44],"LinearARD,":[45],"a":[46,60,102],"self-distillation":[47],"method":[48,151],"that":[49],"restores":[50],"Rotary":[51],"Position":[52],"Embeddings":[53],"(RoPE)-scaled":[54],"students":[55],"through":[56],"attention-structure":[57],"consistency":[58],"with":[59],"frozen":[61],"native-RoPE":[62],"teacher.":[63],"Rather":[64],"than":[65],"matching":[66],"opaque":[67],"hidden":[68],"states,":[69],"LinearARD":[70,133],"aligns":[71],"the":[72,90,116,137,162],"row-wise":[73],"distributions":[74],"dense":[76],"$Q/Q$,":[77],"$K/K$,":[78],"and":[79,111,124,168],"$V/V$":[80],"self-relation":[81],"matrices":[82],"directly":[84],"supervise":[85],"attention":[86],"dynamics.":[87],"To":[88],"overcome":[89],"quadratic":[91],"memory":[92],"bottleneck":[93],"$n":[95],"\\times":[96],"n$":[97],"relation":[98],"maps,":[99],"we":[100],"introduce":[101],"linear-memory":[103],"kernel.":[104],"This":[105],"kernel":[106],"leverages":[107],"per-token":[108],"log-sum-exp":[109],"statistics":[110],"fuses":[112],"logit":[113],"recomputation":[114],"into":[115],"backward":[117],"pass":[118],"compute":[120],"exact":[121],"Kullback-Leibler":[122],"divergence":[123],"gradients.":[125],"On":[126],"LLaMA2-7B":[127],"extended":[128],"from":[129],"4K":[130],"32K,":[132],"recovers":[134],"98.3\\%":[135],"state-of-the-art":[141],"baselines":[142],"while":[143],"surpassing":[144],"them":[145],"long-context":[147],"Notably,":[149],"our":[150],"achieves":[152],"these":[153],"results":[154],"using":[155],"only":[156],"\\textbf{4.25M}":[157],"training":[158],"tokens":[159,164],"compared":[160],"\\textbf{256M}":[163],"required":[165],"LongReD":[167],"CPT.":[169],"Our":[170],"code":[171],"available":[173],"at":[174],"https://github.com/gracefulning/LinearARD.":[175]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
