{"id":"https://openalex.org/W4413755464","doi":"https://doi.org/10.1109/isvlsi65124.2025.11130221","title":"FlatAttention: Dataflow and Fabric Collectives Co-Optimization for Efficient Multi-Head Attention on Tile-Based Many-PE Accelerators","display_name":"FlatAttention: Dataflow and Fabric Collectives Co-Optimization for Efficient Multi-Head Attention on Tile-Based Many-PE Accelerators","publication_year":2025,"publication_date":"2025-07-06","ids":{"openalex":"https://openalex.org/W4413755464","doi":"https://doi.org/10.1109/isvlsi65124.2025.11130221"},"language":"en","primary_location":{"id":"doi:10.1109/isvlsi65124.2025.11130221","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isvlsi65124.2025.11130221","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100458165","display_name":"Chi Zhang","orcid":"https://orcid.org/0000-0002-2503-857X"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Chi Zhang","raw_affiliation_strings":["Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094583316","display_name":"Luca Colagrande","orcid":"https://orcid.org/0000-0002-7986-1975"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Luca Colagrande","raw_affiliation_strings":["Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050957615","display_name":"Renzo Andri","orcid":"https://orcid.org/0000-0002-8776-5158"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Renzo Andri","raw_affiliation_strings":["Computing Systems Lab, Huawei Zurich Research Center,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"Computing Systems Lab, Huawei Zurich Research Center,Zurich,Switzerland","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085591358","display_name":"Thomas Benz","orcid":"https://orcid.org/0000-0002-0326-9676"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Thomas Benz","raw_affiliation_strings":["Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073926165","display_name":"Gamze \u0130slamo\u011flu","orcid":"https://orcid.org/0000-0002-5129-1691"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Gamze Islamoglu","raw_affiliation_strings":["Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054770753","display_name":"Alessandro Nadalini","orcid":"https://orcid.org/0009-0007-3574-7576"},"institutions":[{"id":"https://openalex.org/I9360294","display_name":"University of Bologna","ror":"https://ror.org/01111rn36","country_code":"IT","type":"education","lineage":["https://openalex.org/I9360294"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Alessandro Nadalini","raw_affiliation_strings":["University of Bologna,Department of Electrical, Electronic, and Information Engineering (DEI),Bologna,Italy"],"affiliations":[{"raw_affiliation_string":"University of Bologna,Department of Electrical, Electronic, and Information Engineering (DEI),Bologna,Italy","institution_ids":["https://openalex.org/I9360294"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038717922","display_name":"Francesco Conti","orcid":"https://orcid.org/0000-0002-7924-933X"},"institutions":[{"id":"https://openalex.org/I9360294","display_name":"University of Bologna","ror":"https://ror.org/01111rn36","country_code":"IT","type":"education","lineage":["https://openalex.org/I9360294"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Francesco Conti","raw_affiliation_strings":["University of Bologna,Department of Electrical, Electronic, and Information Engineering (DEI),Bologna,Italy"],"affiliations":[{"raw_affiliation_string":"University of Bologna,Department of Electrical, Electronic, and Information Engineering (DEI),Bologna,Italy","institution_ids":["https://openalex.org/I9360294"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377386","display_name":"Yawei Li","orcid":"https://orcid.org/0000-0002-8948-7892"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Yawei Li","raw_affiliation_strings":["Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"Integrated Systems Laboratory (IIS), ETH Zurich,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043408422","display_name":"Luca Benini","orcid":"https://orcid.org/0000-0001-8068-3806"},"institutions":[{"id":"https://openalex.org/I9360294","display_name":"University of Bologna","ror":"https://ror.org/01111rn36","country_code":"IT","type":"education","lineage":["https://openalex.org/I9360294"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Luca Benini","raw_affiliation_strings":["University of Bologna,Department of Electrical, Electronic, and Information Engineering (DEI),Bologna,Italy"],"affiliations":[{"raw_affiliation_string":"University of Bologna,Department of Electrical, Electronic, and Information Engineering (DEI),Bologna,Italy","institution_ids":["https://openalex.org/I9360294"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100458165"],"corresponding_institution_ids":["https://openalex.org/I35440088"],"apc_list":null,"apc_paid":null,"fwci":1.6429,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.86342583,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9830999970436096,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9830999970436096,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11992","display_name":"CCD and CMOS Imaging Sensors","score":0.9760000109672546,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13182","display_name":"Quantum-Dot Cellular Automata","score":0.9728999733924866,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.9205396175384521},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7095852494239807},{"id":"https://openalex.org/keywords/tile","display_name":"Tile","score":0.6649600267410278},{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.530346691608429},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4377307891845703},{"id":"https://openalex.org/keywords/dataflow-architecture","display_name":"Dataflow architecture","score":0.4200001060962677},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.35703253746032715},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.23649871349334717},{"id":"https://openalex.org/keywords/composite-material","display_name":"Composite material","score":0.09327149391174316},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.09029746055603027}],"concepts":[{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.9205396175384521},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7095852494239807},{"id":"https://openalex.org/C2780728851","wikidata":"https://www.wikidata.org/wiki/Q468402","display_name":"Tile","level":2,"score":0.6649600267410278},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.530346691608429},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4377307891845703},{"id":"https://openalex.org/C176727019","wikidata":"https://www.wikidata.org/wiki/Q1172415","display_name":"Dataflow architecture","level":3,"score":0.4200001060962677},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.35703253746032715},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.23649871349334717},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.09327149391174316},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.09029746055603027},{"id":"https://openalex.org/C114793014","wikidata":"https://www.wikidata.org/wiki/Q52109","display_name":"Geomorphology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/isvlsi65124.2025.11130221","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isvlsi65124.2025.11130221","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)","raw_type":"proceedings-article"},{"id":"pmh:oai:cris.unibo.it:11585/1040837","is_oa":false,"landing_page_url":"https://hdl.handle.net/11585/1040837","pdf_url":null,"source":{"id":"https://openalex.org/S4306402579","display_name":"Archivio istituzionale della ricerca (Alma Mater Studiorum Universit\u00e0 di Bologna)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210117483","host_organization_name":"Istituto di Ematologia di Bologna","host_organization_lineage":["https://openalex.org/I4210117483"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/conferenceObject"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2564598376","https://openalex.org/W1484403103","https://openalex.org/W2584408851","https://openalex.org/W2115158825","https://openalex.org/W2101960124","https://openalex.org/W4377693460","https://openalex.org/W2033683327","https://openalex.org/W2783505431","https://openalex.org/W2521947294","https://openalex.org/W4236419692"],"abstract_inverted_index":{"Multi-Head":[0],"Attention":[1],"(MHA)":[2],"is":[3,31],"a":[4,44,103,109,163],"critical":[5],"computational":[6],"kernel":[7],"in":[8,130,167],"transformer-based":[9],"AI":[10],"models.":[11],"Emerging":[12],"scalable":[13],"tile-based":[14,50,85,106,148],"accelerator":[15,107,149],"architectures":[16],"integrate":[17],"increasing":[18],"numbers":[19],"of":[20,37],"tightly-packed":[21],"processing":[22],"elements":[23],"(PEs)":[24],"with":[25,115],"tensor":[26],"units.":[27,40],"MHA":[28,48],"dataflow":[29,46,83],"mapping":[30],"crucial":[32],"for":[33,47,102],"achieving":[34],"high":[35],"utilization":[36,139],"the":[38,65,124,143,159,172],"available":[39],"We":[41],"propose":[42],"FlatAttention,":[43],"new":[45],"on":[49,84,142,171],"many-PE":[51],"accelerators,":[52],"minimizing":[53],"costly":[54],"main":[55],"memory":[56],"(HBM)":[57],"accesses":[58],"by":[59,91],"leveraging":[60],"collective":[61],"primitives":[62],"integrated":[63],"into":[64],"on-chip":[66],"network":[67],"fabric.":[68],"FlatAttention":[69,129],"achieves":[70,133],"up":[71,134],"to":[72,123,135,158],"$89.3":[73],"\\%$":[74,153],"utilization,":[75],"and":[76],"$4.1":[77],"\\times$":[78,137,165],"performance":[79],"speedup":[80],"over":[81,140],"FlashAttention-3":[82],"accelerators":[86],"whilst":[87],"reducing":[88],"HBM":[89,155],"traffic":[90],"$16":[92],"\\times$.":[93],"Through":[94],"algorithm-architecture":[95],"co-exploration,":[96],"we":[97],"identify":[98],"an":[99],"optimal":[100],"configuration":[101,132,150],"large":[104],"scaled-out":[105],"featuring":[108],"$32":[110],"\\times":[111],"32$":[112],"tile":[113],"mesh":[114],"1024":[116],"TFLOPS":[117],"@":[118],"FP16":[119],"peak":[120],"performance,":[121],"comparable":[122],"state-of-the-art":[125],"Nvidia":[126],"H100":[127,144,160],"GPU.":[128,145],"this":[131,147],"$1.3":[136],"higher":[138],"FlashAttention3":[141],"Meanwhile,":[146],"requires":[151],"$40":[152],"less":[154],"bandwidth":[156],"compared":[157],"GPU,":[161],"enabling":[162],"$1.8":[164],"reduction":[166],"die":[168],"size,":[169],"estimated":[170],"same":[173],"technology":[174],"node.":[175]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-03-06T13:50:29.536080","created_date":"2025-10-10T00:00:00"}
