{"id":"https://openalex.org/W7134042529","doi":"https://doi.org/10.48550/arxiv.2603.05451","title":"FlashAttention-4: Algorithm and Kernel Pipelining Co-Design for Asymmetric Hardware Scaling","display_name":"FlashAttention-4: Algorithm and Kernel Pipelining Co-Design for Asymmetric Hardware Scaling","publication_year":2026,"publication_date":"2026-03-05","ids":{"openalex":"https://openalex.org/W7134042529","doi":"https://doi.org/10.48550/arxiv.2603.05451"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.05451","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128278344","display_name":"Ted Zadouri","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zadouri, Ted","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120319537","display_name":"Markus Hoehnerbach","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hoehnerbach, Markus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128222924","display_name":"Jay Shah","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Jay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128240381","display_name":"Timmy Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Timmy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086615721","display_name":"Vijay Thakkar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Thakkar, Vijay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128258856","display_name":"Tri Dao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dao, Tri","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5128278344"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.847000002861023,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.847000002861023,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.029400000348687172,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.01679999940097332,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5468999743461609},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5338000059127808},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.5317999720573425},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5277000069618225},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4747999906539917},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.3456000089645386},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.3361999988555908},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.32670000195503235}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8338000178337097},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6449999809265137},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5468999743461609},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5338000059127808},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.5317999720573425},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5277000069618225},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4747999906539917},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.39169999957084656},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3456000089645386},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.3361999988555908},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.32670000195503235},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3003000020980835},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.288100004196167},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2757999897003174},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C151376022","wikidata":"https://www.wikidata.org/wiki/Q168698","display_name":"Exponential function","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C76399640","wikidata":"https://www.wikidata.org/wiki/Q189401","display_name":"Virtual memory","level":4,"score":0.26179999113082886},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.26170000433921814},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.25760000944137573},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.25679999589920044},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2563999891281128}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.05451","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.05451","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.05451","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.05451","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.5896758437156677,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Attention,":[0],"as":[1,50],"a":[2],"core":[3,67],"layer":[4],"of":[5],"the":[6,11,36,51,127,140],"ubiquitous":[7],"Transformer":[8],"architecture,":[9],"is":[10],"bottleneck":[12],"for":[13,24],"large":[14],"language":[15],"models":[16],"and":[17,30,53,106,113,121,126,136,157],"long-context":[18],"applications.":[19],"While":[20],"FlashAttention-3":[21],"optimized":[22],"attention":[23],"Hopper":[25],"GPUs":[26,163],"through":[27],"asynchronous":[28,103],"execution":[29],"warp":[31],"specialization,":[32],"it":[33],"primarily":[34],"targets":[35],"H100":[37],"architecture.":[38],"The":[39],"AI":[40],"industry":[41],"has":[42],"rapidly":[43],"transitioned":[44],"to":[45,62,89,131,151,168,191],"deploying":[46],"Blackwell-based":[47],"systems":[48],"such":[49],"B200":[52,162],"GB200,":[54],"which":[55],"exhibit":[56],"fundamentally":[57],"different":[58],"performance":[59],"characteristics":[60],"due":[61],"asymmetric":[63],"hardware":[64],"scaling:":[65],"tensor":[66,124],"throughput":[68],"doubles":[69],"while":[70,196],"other":[71],"functional":[72],"units":[73],"(shared":[74],"memory":[75,125,134],"bandwidth,":[76],"exponential":[77,112],"units)":[78],"scale":[79],"more":[80],"slowly":[81],"or":[82],"remain":[83],"unchanged.":[84],"We":[85,143],"develop":[86],"several":[87],"techniques":[88],"address":[90],"these":[91],"shifting":[92],"bottlenecks":[93],"on":[94,161],"Blackwell":[95],"GPUs:":[96],"(1)":[97],"redesigned":[98],"pipelines":[99],"that":[100,117,145],"exploit":[101],"fully":[102],"MMA":[104,129],"operations":[105],"larger":[107],"tile":[108],"sizes,":[109],"(2)":[110],"software-emulated":[111],"conditional":[114],"softmax":[115],"rescaling":[116],"reduces":[118],"non-matmul":[119],"operations,":[120],"(3)":[122],"leveraging":[123],"2-CTA":[128],"mode":[130],"reduce":[132],"shared":[133],"traffic":[135],"atomic":[137],"adds":[138],"in":[139,180,183],"backward":[141],"pass.":[142],"demonstrate":[144],"our":[146],"method,":[147],"FlashAttention-4,":[148],"achieves":[149],"up":[150,167],"1.3$\\times$":[152],"speedup":[153],"over":[154,159],"cuDNN":[155],"9.13":[156],"2.7$\\times$":[158],"Triton":[160],"with":[164],"BF16,":[165],"reaching":[166],"1613":[169],"TFLOPs/s":[170],"(71%":[171],"utilization).":[172],"Beyond":[173],"algorithmic":[174],"innovations,":[175],"we":[176],"implement":[177],"FlashAttention-4":[178],"entirely":[179],"CuTe-DSL":[181],"embedded":[182],"Python,":[184],"achieving":[185],"20-30$\\times$":[186],"faster":[187],"compile":[188],"times":[189],"compared":[190],"traditional":[192],"C++":[193],"template-based":[194],"approaches":[195],"maintaining":[197],"full":[198],"expressivity.":[199]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-07T00:00:00"}
