{"id":"https://openalex.org/W7133308323","doi":"https://doi.org/10.48550/arxiv.2603.00175","title":"Self-Attention And Beyond the Infinite: Towards Linear Transformers with Infinite Self-Attention","display_name":"Self-Attention And Beyond the Infinite: Towards Linear Transformers with Infinite Self-Attention","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7133308323","doi":"https://doi.org/10.48550/arxiv.2603.00175"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00175","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00175","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00175","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5107077508","display_name":"Giorgio Roffo","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Roffo, Giorgio","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128641196","display_name":"Luke Palmer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdelkawy, Hazem","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lavie, Nilli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lavie, Nilli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Palmer, Luke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Palmer, Luke","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5107077508"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.41920000314712524,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.41920000314712524,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.19050000607967377,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.08479999750852585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.636900007724762},{"id":"https://openalex.org/keywords/quadratic-equation","display_name":"Quadratic equation","score":0.5580000281333923},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4763000011444092},{"id":"https://openalex.org/keywords/eigenvalues-and-eigenvectors","display_name":"Eigenvalues and eigenvectors","score":0.4575999975204468},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.4174000024795532},{"id":"https://openalex.org/keywords/markov-chain","display_name":"Markov chain","score":0.38909998536109924},{"id":"https://openalex.org/keywords/hilbert-space","display_name":"Hilbert space","score":0.3799000084400177},{"id":"https://openalex.org/keywords/fixed-point","display_name":"Fixed point","score":0.3571000099182129},{"id":"https://openalex.org/keywords/countable-set","display_name":"Countable set","score":0.35359999537467957},{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.34940001368522644}],"concepts":[{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.636900007724762},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.6344000101089478},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.5580000281333923},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4763000011444092},{"id":"https://openalex.org/C158693339","wikidata":"https://www.wikidata.org/wiki/Q190524","display_name":"Eigenvalues and eigenvectors","level":2,"score":0.4575999975204468},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.4465999901294708},{"id":"https://openalex.org/C118615104","wikidata":"https://www.wikidata.org/wiki/Q121416","display_name":"Discrete mathematics","level":1,"score":0.43380001187324524},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.4174000024795532},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.41200000047683716},{"id":"https://openalex.org/C98763669","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov chain","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C62799726","wikidata":"https://www.wikidata.org/wiki/Q190056","display_name":"Hilbert space","level":2,"score":0.3799000084400177},{"id":"https://openalex.org/C61445026","wikidata":"https://www.wikidata.org/wiki/Q217608","display_name":"Fixed point","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C110729354","wikidata":"https://www.wikidata.org/wiki/Q185478","display_name":"Countable set","level":2,"score":0.35359999537467957},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.34940001368522644},{"id":"https://openalex.org/C77246614","wikidata":"https://www.wikidata.org/wiki/Q1409400","display_name":"Gramian matrix","level":3,"score":0.3488999903202057},{"id":"https://openalex.org/C48406656","wikidata":"https://www.wikidata.org/wiki/Q534112","display_name":"Martingale (probability theory)","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C159886148","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov process","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.329800009727478},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.322299987077713},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.3181000053882599},{"id":"https://openalex.org/C80469333","wikidata":"https://www.wikidata.org/wiki/Q189088","display_name":"Von Neumann architecture","level":2,"score":0.3086000084877014},{"id":"https://openalex.org/C42747912","wikidata":"https://www.wikidata.org/wiki/Q1048447","display_name":"Multiplicative function","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.303600013256073},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C53811970","wikidata":"https://www.wikidata.org/wiki/Q5062194","display_name":"Centrality","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C166437778","wikidata":"https://www.wikidata.org/wiki/Q50695","display_name":"Quadratic function","level":3,"score":0.27950000762939453},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.27090001106262207},{"id":"https://openalex.org/C49766605","wikidata":"https://www.wikidata.org/wiki/Q207643","display_name":"Linear map","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.2648000121116638},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.26339998841285706},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C147925508","wikidata":"https://www.wikidata.org/wiki/Q970767","display_name":"Generalized eigenvector","level":5,"score":0.25679999589920044},{"id":"https://openalex.org/C2779172887","wikidata":"https://www.wikidata.org/wiki/Q184316","display_name":"PageRank","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.25440001487731934},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00175","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00175","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00175","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00175","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,238],"quadratic":[1,248],"cost":[2],"of":[3,70,83,100,116,125,246],"softmax":[4,178],"attention":[5,23,43,103,109],"limits":[6],"Transformer":[7],"scalability":[8],"in":[9],"high-resolution":[10],"vision.":[11],"We":[12,60,88],"introduce":[13],"Infinite":[14],"Self-Attention":[15],"(InfSA),":[16],"a":[17,26,30,38,76,92,153,170],"spectral":[18],"reformulation":[19],"that":[20,95],"treats":[21],"each":[22],"layer":[24],"as":[25],"diffusion":[27],"step":[28],"on":[29,168],"content-adaptive":[31],"token":[32,58],"graph,":[33],"accumulating":[34],"multi-hop":[35],"interactions":[36],"through":[37],"discounted":[39],"Neumann":[40,64],"series":[41],"over":[42,175],"matrices.":[44],"This":[45],"links":[46],"self-attention":[47],"to":[48,120,194,230],"classical":[49],"graph":[50],"centrality":[51,78],"(Katz,":[52],"PageRank,":[53],"eigenvector":[54,99,245],"centrality)":[55],"for":[56],"interpretable":[57],"weighting.":[59],"also":[61],"show":[62],"the":[63,67,97,101,107,182,226,243,247],"kernel":[65],"equals":[66],"fundamental":[68],"matrix":[69],"an":[71,113,176,204],"absorbing":[72],"Markov":[73],"chain,":[74],"so":[75],"token's":[77],"is":[79,129,225],"its":[80],"expected":[81],"number":[82],"random-walk":[84],"visits":[85],"before":[86],"absorption.":[87],"then":[89],"propose":[90],"Linear-InfSA,":[91],"linear-time":[93],"variant":[94],"approximates":[96],"principal":[98],"implicit":[102],"operator":[104,249],"without":[105,236],"forming":[106],"full":[108],"matrix.":[110],"It":[111],"keeps":[112],"auxiliary":[114],"state":[115],"fixed":[117],"size":[118],"proportional":[119],"per-head":[121],"dimension":[122],"dh":[123],"(independent":[124],"sequence":[126],"length":[127],"N),":[128],"drop-in":[130],"compatible":[131],"with":[132,181],"Vision":[133],"Transformers,":[134],"and":[135,143,213,219,224],"supports":[136],"stable":[137],"training":[138],"at":[139,145,160,210],"4096":[140,142],"by":[141,147,162,233],"inference":[144,235],"9216":[146,148,232,234],"(about":[149],"332k":[150],"tokens).":[151],"In":[152],"4-layer":[154],"ViT":[155,179],"(53.5M":[156],"parameters,":[157],"59":[158],"GFLOPs":[159],"224":[161],"224),":[163],"Linear-InfSA":[164],"reaches":[165],"84.7%":[166],"top-1":[167],"ImageNet-1K,":[169],"+3.2":[171],"point":[172],"architectural":[173],"gain":[174],"equal-depth":[177,222],"trained":[180],"same":[183],"recipe.":[184],"On":[185,203],"ImageNet-V2,":[186],"InfViT":[187],"variants":[188],"outperform":[189],"all":[190],"compared":[191],"baselines":[192],"(up":[193],"79.8%":[195],"vs":[196],"76.8%),":[197],"indicating":[198],"robustness":[199],"under":[200],"distribution":[201],"shift.":[202],"A100":[205],"40GB":[206],"GPU,":[207],"Linear-InfViT":[208],"runs":[209],"231":[211],"images/s":[212],"0.87":[214],"J/image":[215],"(13x":[216],"better":[217],"throughput":[218],"energy":[220],"than":[221],"ViT)":[223],"only":[227],"tested":[228],"model":[229],"complete":[231],"out-of-memory.":[237],"linear":[239],"approximation":[240],"closely":[241],"matches":[242],"dominant":[244],"(cosine":[250],"0.985).":[251]},"counts_by_year":[],"updated_date":"2026-04-02T13:48:15.688549","created_date":"2026-03-04T00:00:00"}
