{"id":"https://openalex.org/W4398156940","doi":"https://doi.org/10.48550/arxiv.2405.10480","title":"Lean Attention: Hardware-Aware Scalable Attention Mechanism for the Decode-Phase of Transformers","display_name":"Lean Attention: Hardware-Aware Scalable Attention Mechanism for the Decode-Phase of Transformers","publication_year":2024,"publication_date":"2024-05-17","ids":{"openalex":"https://openalex.org/W4398156940","doi":"https://doi.org/10.48550/arxiv.2405.10480"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2405.10480","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.10480","pdf_url":"https://arxiv.org/pdf/2405.10480","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2405.10480","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5098717251","display_name":"Rya Sanovar","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sanovar, Rya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089179434","display_name":"Srikant Bharadwaj","orcid":"https://orcid.org/0000-0002-0422-5210"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bharadwaj, Srikant","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068951662","display_name":"Ren\u00e9e St. Amant","orcid":"https://orcid.org/0009-0006-9387-5886"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amant, Renee St.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049145726","display_name":"Victor R\u00fchle","orcid":"https://orcid.org/0000-0002-8957-7628"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"R\u00fchle, Victor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5070722259","display_name":"Saravan Rajmohan","orcid":"https://orcid.org/0000-0002-2019-213X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajmohan, Saravan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5098717251"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11159","display_name":"Manufacturing Process and Optimization","score":0.8363999724388123,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11159","display_name":"Manufacturing Process and Optimization","score":0.8363999724388123,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12111","display_name":"Industrial Vision Systems and Defect Detection","score":0.7601000070571899,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6374706029891968},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6005585789680481},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.5399269461631775},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5390979051589966},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.3868926167488098},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.36418378353118896},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.15139424800872803},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.14258843660354614},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.10119935870170593},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.07207274436950684},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.03609892725944519}],"concepts":[{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6374706029891968},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6005585789680481},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.5399269461631775},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5390979051589966},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3868926167488098},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.36418378353118896},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.15139424800872803},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.14258843660354614},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.10119935870170593},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.07207274436950684},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.03609892725944519},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2405.10480","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.10480","pdf_url":"https://arxiv.org/pdf/2405.10480","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2405.10480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2405.10480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2405.10480","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.10480","pdf_url":"https://arxiv.org/pdf/2405.10480","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4398156940.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2389214306","https://openalex.org/W2382997850","https://openalex.org/W2390968135","https://openalex.org/W4235240664","https://openalex.org/W2965083567","https://openalex.org/W1838576100","https://openalex.org/W2095886385","https://openalex.org/W2889616422","https://openalex.org/W2089704382","https://openalex.org/W1983399550"],"abstract_inverted_index":{"Transformer-based":[0],"models":[1,27,37],"have":[2,89],"emerged":[3],"as":[4,52,82,177],"one":[5],"of":[6,24,33,60,68,98,116,130,138,153,171,200,213],"the":[7,25,55,61,69,94,112,134,145,150,159,163,168,186,196],"most":[8],"widely":[9],"used":[10],"architectures":[11],"for":[12,133,149,162,225],"natural":[13,16],"language":[14,17],"processing,":[15],"generation,":[18],"and":[19,41,57,75,86,220],"image":[20],"generation.":[21],"The":[22],"size":[23],"state-of-the-art":[26],"has":[28],"increased":[29],"steadily":[30],"reaching":[31],"billions":[32],"parameters.":[34],"These":[35],"huge":[36],"are":[38],"memory":[39,58],"hungry":[40],"incur":[42],"significant":[43],"inference":[44],"latency":[45,96],"even":[46],"on":[47,101],"cutting":[48],"edge":[49],"AI-accelerators,":[50],"such":[51,81,102],"GPUs.":[53],"Specifically,":[54],"time":[56],"complexity":[59],"attention":[62,146,187,215],"operation":[63,180],"is":[64],"quadratic":[65],"in":[66,210],"terms":[67],"total":[70],"context":[71,155,192,227],"length,":[72],"i.e.,":[73],"prompt":[74],"output":[76],"tokens.":[77],"Thus,":[78],"several":[79],"optimizations":[80],"key-value":[83],"tensor":[84],"caching":[85],"FlashAttention":[87],"computation":[88,188,208],"been":[90],"proposed":[91],"to":[92,111,184,203,205,222],"deliver":[93],"low":[95],"demands":[97],"applications":[99],"relying":[100],"large":[103,191],"models.":[104,141],"However,":[105],"these":[106,190],"techniques":[107],"do":[108],"not":[109],"cater":[110],"computationally":[113],"distinct":[114],"nature":[115],"different":[117],"phases":[118],"during":[119],"inference.":[120],"To":[121],"that":[122,167],"end,":[123],"we":[124],"propose":[125],"LeanAttention,":[126],"a":[127,178],"scalable":[128],"technique":[129],"computing":[131],"self-attention":[132,204],"token-generation":[135],"phase":[136],"(decode-phase)":[137],"decoder-only":[139],"transformer":[140],"LeanAttention":[142],"enables":[143],"scaling":[144],"mechanism":[147],"implementation":[148],"challenging":[151],"case":[152],"long":[154],"lengths":[156],"by":[157],"re-designing":[158],"execution":[160,216],"flow":[161],"decode-phase.":[164],"We":[165,194],"identify":[166],"associative":[169],"property":[170],"online":[172],"softmax":[173],"can":[174],"be":[175],"treated":[176],"reduction":[179,199],"thus":[181],"allowing":[182],"us":[183],"parallelize":[185],"over":[189,218],"lengths.":[193,228],"extend":[195],"\"stream-K\"":[197],"style":[198],"tiled":[201],"calculation":[202],"enable":[206],"parallel":[207],"resulting":[209],"an":[211],"average":[212],"2.6x":[214],"speedup":[217,224],"FlashAttention-2":[219],"up":[221],"8.33x":[223],"512k":[226]},"counts_by_year":[],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
