{"id":"https://openalex.org/W4392650608","doi":"https://doi.org/10.1007/s11227-024-05890-8","title":"SWattention: designing fast and memory-efficient attention for a new Sunway Supercomputer","display_name":"SWattention: designing fast and memory-efficient attention for a new Sunway Supercomputer","publication_year":2024,"publication_date":"2024-03-11","ids":{"openalex":"https://openalex.org/W4392650608","doi":"https://doi.org/10.1007/s11227-024-05890-8"},"language":"en","primary_location":{"id":"doi:10.1007/s11227-024-05890-8","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11227-024-05890-8","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11227-024-05890-8.pdf","source":{"id":"https://openalex.org/S32326811","display_name":"The Journal of Supercomputing","issn_l":"0920-8542","issn":["0920-8542","1573-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Journal of Supercomputing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s11227-024-05890-8.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010615486","display_name":"Ruohan Wu","orcid":"https://orcid.org/0000-0001-5514-224X"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ruohan Wu","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102627707","display_name":"Xianyu Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianyu Zhu","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101436158","display_name":"Junshi Chen","orcid":"https://orcid.org/0000-0002-6487-3658"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junshi Chen","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100402709","display_name":"Sha Liu","orcid":"https://orcid.org/0000-0002-7168-330X"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sha Liu","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013845626","display_name":"Tianyu Zheng","orcid":"https://orcid.org/0000-0001-9030-9957"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianyu Zheng","raw_affiliation_strings":["Zhejiang Lab, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100352268","display_name":"Xin Liu","orcid":"https://orcid.org/0000-0002-7870-6535"},"institutions":[{"id":"https://openalex.org/I4210158984","display_name":"National Supercomputing Center in Wuxi","ror":"https://ror.org/04ypjrs34","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210158984"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Liu","raw_affiliation_strings":["National Supercomputing Center in Wuxi, Wuxi, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Supercomputing Center in Wuxi, Wuxi, China","institution_ids":["https://openalex.org/I4210158984"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085758579","display_name":"Hong An","orcid":"https://orcid.org/0000-0002-3900-3722"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong An","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5010615486"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":{"value":2390,"currency":"EUR","value_usd":2990},"apc_paid":{"value":2390,"currency":"EUR","value_usd":2990},"fwci":0.9161,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.69488997,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":"80","issue":"10","first_page":"13657","last_page":"13680"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.991599977016449,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.896490216255188},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8884597420692444},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.8107335567474365},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.7749104499816895},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.5603621006011963},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5073739886283875},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.5000040531158447},{"id":"https://openalex.org/keywords/auxiliary-memory","display_name":"Auxiliary memory","score":0.4528055191040039},{"id":"https://openalex.org/keywords/high-memory","display_name":"High memory","score":0.4122486710548401},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.30198949575424194},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.2940506339073181}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.896490216255188},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8884597420692444},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.8107335567474365},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.7749104499816895},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.5603621006011963},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5073739886283875},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.5000040531158447},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.4528055191040039},{"id":"https://openalex.org/C2781357197","wikidata":"https://www.wikidata.org/wiki/Q5757597","display_name":"High memory","level":2,"score":0.4122486710548401},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30198949575424194},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2940506339073181},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s11227-024-05890-8","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11227-024-05890-8","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11227-024-05890-8.pdf","source":{"id":"https://openalex.org/S32326811","display_name":"The Journal of Supercomputing","issn_l":"0920-8542","issn":["0920-8542","1573-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Journal of Supercomputing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s11227-024-05890-8","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11227-024-05890-8","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11227-024-05890-8.pdf","source":{"id":"https://openalex.org/S32326811","display_name":"The Journal of Supercomputing","issn_l":"0920-8542","issn":["0920-8542","1573-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Journal of Supercomputing","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8999042641","display_name":null,"funder_award_id":"62102389","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320321133","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35"},{"id":"https://openalex.org/F4320325599","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4392650608.pdf"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W1981276685","https://openalex.org/W2002555321","https://openalex.org/W2921898934","https://openalex.org/W3004659153","https://openalex.org/W3081168214","https://openalex.org/W3128098196","https://openalex.org/W3130554079","https://openalex.org/W3131922516","https://openalex.org/W3204462148","https://openalex.org/W3204998121","https://openalex.org/W4220838824","https://openalex.org/W4226094436","https://openalex.org/W4229447288","https://openalex.org/W4384705355","https://openalex.org/W6600445788","https://openalex.org/W6629340653","https://openalex.org/W6702248584"],"related_works":["https://openalex.org/W4376647684","https://openalex.org/W2089136983","https://openalex.org/W2398743891","https://openalex.org/W1580607742","https://openalex.org/W4243474409","https://openalex.org/W2058636218","https://openalex.org/W2993780376","https://openalex.org/W2607256436","https://openalex.org/W2289732183","https://openalex.org/W4385819989"],"abstract_inverted_index":{"Abstract":[0],"In":[1],"the":[2,13,24,28,46,50,60,65,79,83,89,100,137,174],"past":[3],"few":[4],"years,":[5],"Transformer-based":[6],"large":[7],"language":[8],"models":[9],"(LLM)":[10],"have":[11],"become":[12],"dominant":[14],"technology":[15],"in":[16],"a":[17,73,104,124],"series":[18],"of":[19,27,170],"applications.":[20],"To":[21,86],"scale":[22,163],"up":[23,164,179],"sequence":[25,155,193],"length":[26,194],"Transformer,":[29],"FlashAttention":[30,47],"is":[31,113,127],"proposed":[32],"to":[33,115,129,160,165,180],"compute":[34],"exact":[35,80],"attention":[36,81],"with":[37,121,136],"reduced":[38],"memory":[39,67,111,118],"requirements":[40],"and":[41,64,94,148,162],"faster":[42],"execution.":[43],"However,":[44],"implementing":[45],"algorithm":[48],"on":[49,82,99],"new":[51],"generation":[52],"Sunway":[53],"Supercomputer":[54],"faces":[55],"many":[56],"constraints":[57],"such":[58],"as":[59],"unique":[61],"heterogeneous":[62],"architecture":[63],"limited":[66],"bandwidth.":[68],"This":[69],"work":[70],"proposes":[71],"SWattention,":[72],"highly":[74],"efficient":[75],"method":[76],"for":[77,145,151,173,183,195],"computing":[78],"SW26010pro":[84],"processor.":[85],"fully":[87],"utilize":[88],"6":[90],"core":[91],"groups":[92],"(CG)":[93],"64":[95],"cores":[96],"per":[97],"CG":[98],"processor,":[101],"we":[102],"design":[103],"two-level":[105],"parallel":[106],"task":[107],"partition":[108],"strategy.":[109],"Asynchronous":[110],"access":[112,119],"employed":[114],"ensure":[116],"that":[117,189],"overlaps":[120],"computation.":[122],"Additionally,":[123],"tiling":[125],"strategy":[126],"introduced":[128],"determine":[130],"optimal":[131],"SRAM":[132],"block":[133],"sizes.":[134],"Compared":[135],"standard":[138],"attention,":[139],"SWattention":[140,177,190],"achieves":[141,178],"around":[142],"2.0x":[143],"speedup":[144,150,182],"FP32":[146],"training":[147,184],"2.5x":[149],"mixed-precision":[152],"training.":[153,197],"The":[154],"lengths":[156],"range":[157],"from":[158],"1k":[159],"8k":[161],"16k":[166],"without":[167],"being":[168],"out":[169],"memory.":[171],"As":[172],"end-to-end":[175],"performance,":[176],"1.26x":[181],"GPT-style":[185],"models,":[186],"which":[187],"demonstrates":[188],"enables":[191],"longer":[192],"LLM":[196]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
