{"id":"https://openalex.org/W7131081202","doi":"https://doi.org/10.48550/arxiv.2602.18196","title":"RAT+: Train Dense, Infer Sparse -- Recurrence Augmented Attention for Dilated Inference","display_name":"RAT+: Train Dense, Infer Sparse -- Recurrence Augmented Attention for Dilated Inference","publication_year":2026,"publication_date":"2026-02-20","ids":{"openalex":"https://openalex.org/W7131081202","doi":"https://doi.org/10.48550/arxiv.2602.18196"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.18196","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126602799","display_name":"Xiuying Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wei, Xiuying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126653781","display_name":"Caglar Gulcehre","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gulcehre, Caglar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5126602799"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.310699999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.310699999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.14180000126361847,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.13210000097751617,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7102000117301941},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4449999928474426},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.41290000081062317},{"id":"https://openalex.org/keywords/dilation","display_name":"Dilation (metric space)","score":0.37599998712539673},{"id":"https://openalex.org/keywords/mode","display_name":"Mode (computer interface)","score":0.3750999867916107},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.37290000915527344},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.3197000026702881},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.30970001220703125}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7208999991416931},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7102000117301941},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6207000017166138},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C2780757906","wikidata":"https://www.wikidata.org/wiki/Q5276676","display_name":"Dilation (metric space)","level":2,"score":0.37599998712539673},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.3750999867916107},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.37290000915527344},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.33730000257492065},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.29899999499320984},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2989000082015991},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.2896000146865845},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.18196","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.18196","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.18196","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.18196","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Structured":[0],"dilated":[1,49,87],"attention":[2,15,45,64,88,142],"has":[3],"an":[4],"appealing":[5],"inference-time":[6],"efficiency":[7],"knob:":[8],"it":[9],"reduces":[10],"the":[11,14,17,25,146,161],"FLOPs":[12],"of":[13,24,40],"and":[16,68,124,135,156,159],"KV":[18],"cache":[19],"size":[20,27],"by":[21,126],"a":[22,36,43,48,59,99],"factor":[23],"dilation":[26],"D,":[28],"while":[29],"preserving":[30],"long-range":[31],"connectivity.":[32],"However,":[33],"we":[34],"find":[35],"persistent":[37],"failure":[38],"mode":[39],"them":[41],"-sparsifying":[42],"pretrained":[44,77],"model":[46,75],"to":[47,52,86,145,153],"pattern":[50],"leads":[51],"severe":[53],"accuracy":[54,121],"degradation.":[55],"We":[56,150],"introduce":[57],"RAT+,":[58],"dense-pretraining":[60],"architecture":[61],"that":[62],"augments":[63],"with":[65,90],"full-sequence":[66],"recurrence":[67,70],"active":[69],"learning.":[71],"A":[72],"single":[73],"RAT+":[74,117,140],"is":[76,165],"densely":[78],"once,":[79],"then":[80],"flexibly":[81],"switched":[82],"at":[83,122,130,167],"inference":[84],"time":[85],"(optionally":[89],"local":[91],"windows)":[92],"or":[93],"hybrid":[94],"layer/head":[95],"compositions,":[96],"requiring":[97],"only":[98],"short":[100],"1B-token":[101],"resolution":[102],"adaptation":[103],"rather":[104],"than":[105],"retraining":[106],"separate":[107],"sparse":[108],"models.":[109],"At":[110],"1.5B":[111],"parameters":[112,155],"trained":[113],"on":[114,132],"100B":[115],"tokens,":[116],"closely":[118],"matches":[119],"dense":[120],"D=16":[123],"drops":[125],"about":[127],"2-3":[128],"points":[129],"D=64":[131],"commonsense":[133],"reasoning":[134],"LongBench":[136],"tasks,":[137],"respectively.":[138],"Moreover,":[139],"outperforms":[141],"when":[143],"sparsifying":[144],"top-k":[147],"block":[148],"attention.":[149],"further":[151],"scale":[152],"2.6B":[154],"200B":[157],"tokens":[158],"observe":[160],"same":[162],"trend.":[163],"Code":[164],"available":[166],"https://github.com/wimh966/rat-plus.":[168]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-24T00:00:00"}
