{"id":"https://openalex.org/W7148852938","doi":"https://doi.org/10.48550/arxiv.2604.01757","title":"Attention Mechanisms Through the Lens of Numerical Methods: Approximation Methods and Alternative Formulations","display_name":"Attention Mechanisms Through the Lens of Numerical Methods: Approximation Methods and Alternative Formulations","publication_year":2026,"publication_date":"2026-04-02","ids":{"openalex":"https://openalex.org/W7148852938","doi":"https://doi.org/10.48550/arxiv.2604.01757"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.01757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.01757","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008454935","display_name":"Michel Fabrice Serret","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Serret, Michel Fabrice","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080876002","display_name":"Alice Cortinovis","orcid":"https://orcid.org/0000-0001-6917-5106"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cortinovis, Alice","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102200938","display_name":"Yijun Dong","orcid":"https://orcid.org/0000-0002-3230-7459"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Yijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073977925","display_name":"Diana Halikias","orcid":"https://orcid.org/0000-0001-7159-7770"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Halikias, Diana","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132916092","display_name":"Anna Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Anna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132895129","display_name":"Fabio Matti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matti, Fabio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132908383","display_name":"Deanna Needell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Needell, Deanna","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084528554","display_name":"Katherine J. Pearce","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pearce, Katherine J.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054763583","display_name":"Elizaveta Rebrova","orcid":"https://orcid.org/0000-0002-4041-4238"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rebrova, Elizaveta","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055681172","display_name":"Disha Shur","orcid":"https://orcid.org/0000-0003-4556-0071"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shur, Disha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132864658","display_name":"Rudi Smith","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smith, Rudi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128122617","display_name":"Hai-Xiao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hai-Xiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132831205","display_name":"Laura Grigori","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grigori, Laura","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.21250000596046448,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.21250000596046448,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.13359999656677246,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11206","display_name":"Model Reduction and Neural Networks","score":0.09009999781847,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quadratic-equation","display_name":"Quadratic equation","score":0.5289999842643738},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5080000162124634},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.49070000648498535},{"id":"https://openalex.org/keywords/numerical-analysis","display_name":"Numerical analysis","score":0.4490000009536743},{"id":"https://openalex.org/keywords/through-the-lens-metering","display_name":"Through-the-lens metering","score":0.4350999891757965},{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.4115999937057495},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.34389999508857727},{"id":"https://openalex.org/keywords/numerical-linear-algebra","display_name":"Numerical linear algebra","score":0.3434000015258789},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.3296000063419342}],"concepts":[{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.5289999842643738},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5141000151634216},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5080000162124634},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.49070000648498535},{"id":"https://openalex.org/C48753275","wikidata":"https://www.wikidata.org/wiki/Q11216","display_name":"Numerical analysis","level":2,"score":0.4490000009536743},{"id":"https://openalex.org/C43091099","wikidata":"https://www.wikidata.org/wiki/Q1067788","display_name":"Through-the-lens metering","level":3,"score":0.4350999891757965},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.4115999937057495},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.36820000410079956},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.35030001401901245},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C163834973","wikidata":"https://www.wikidata.org/wiki/Q2004891","display_name":"Numerical linear algebra","level":3,"score":0.3434000015258789},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.3296000063419342},{"id":"https://openalex.org/C148764684","wikidata":"https://www.wikidata.org/wiki/Q621751","display_name":"Approximation algorithm","level":2,"score":0.32839998602867126},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.3264999985694885},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3181000053882599},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2969000041484833},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C2986878899","wikidata":"https://www.wikidata.org/wiki/Q11216","display_name":"Numerical approximation","level":3,"score":0.28690001368522644},{"id":"https://openalex.org/C15336307","wikidata":"https://www.wikidata.org/wiki/Q1766051","display_name":"Lens (geology)","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.2784000039100647},{"id":"https://openalex.org/C500300565","wikidata":"https://www.wikidata.org/wiki/Q925667","display_name":"Computer simulation","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.271699994802475},{"id":"https://openalex.org/C12362212","wikidata":"https://www.wikidata.org/wiki/Q728435","display_name":"Linear subspace","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.26420000195503235},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.2628999948501587},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.2612000107765198},{"id":"https://openalex.org/C176321772","wikidata":"https://www.wikidata.org/wiki/Q1430640","display_name":"Numerical stability","level":3,"score":0.25290000438690186},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.01757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.01757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.01757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"attention":[1,38,48,113,167],"mechanism":[2],"is":[3,20,70],"the":[4,16,21,51,85,124,145,163],"computational":[5,156],"core":[6],"of":[7,33,53,112,165],"modern":[8],"Transformer":[9],"architectures,":[10],"but":[11],"its":[12],"quadratic":[13],"complexity":[14],"in":[15],"input":[17],"sequence":[18],"length":[19],"bottleneck":[22],"for":[23,152],"large-scale":[24],"inference.":[25],"This":[26],"has":[27],"motivated":[28],"a":[29,57,137],"rapidly":[30],"growing":[31],"body":[32],"work":[34],"aimed":[35],"at":[36],"accelerating":[37],"through":[39,50],"approximation":[40,81],"and":[41,62,78,93,97,104,114,149],"reformulation.":[42],"In":[43],"this":[44],"survey,":[45],"we":[46,73,141],"revisit":[47],"mechanisms":[49],"lens":[52],"numerical":[54,65,86,159],"analysis,":[55],"with":[56],"particular":[58],"emphasis":[59],"on":[60],"tools":[61],"perspectives":[63],"from":[64,155],"linear":[66,160],"algebra.":[67],"Our":[68],"goal":[69],"twofold:":[71],"first,":[72],"aim":[74,142],"to":[75,84,128,143,162],"systematically":[76],"review":[77],"classify":[79],"fast":[80],"methods":[82],"according":[83],"principles":[87],"they":[88],"exploit.":[89],"These":[90],"include":[91],"sparsity":[92],"clustering":[94],"approaches,":[95],"low-rank":[96],"subspace":[98],"projection":[99],"techniques,":[100],"randomized":[101],"sketching":[102],"methods,":[103],"tensor-based":[105],"decompositions.":[106],"We":[107],"also":[108],"discuss":[109],"kernel-inspired":[110],"reformulations":[111],"recent":[115],"architectural":[116],"variants,":[117],"such":[118],"as":[119],"Latent":[120],"Attention,":[121],"that":[122],"modify":[123],"standard":[125],"softmax":[126],"formulation":[127],"improve":[129],"efficiency.":[130],"Second,":[131],"by":[132],"presenting":[133],"these":[134],"developments":[135],"within":[136],"unified":[138],"mathematical":[139],"framework,":[140],"bridge":[144],"gap":[146],"between":[147],"disciplines":[148],"highlight":[150],"opportunities":[151],"further":[153],"contributions":[154],"mathematics,":[157],"particularly":[158],"algebra,":[161],"design":[164],"scalable":[166],"mechanisms.":[168]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-04T00:00:00"}
