{"id":"https://openalex.org/W4399304422","doi":"https://doi.org/10.1145/3650200.3656620","title":"Optimizing Attention by Exploiting Data Reuse on ARM Multi-core CPUs","display_name":"Optimizing Attention by Exploiting Data Reuse on ARM Multi-core CPUs","publication_year":2024,"publication_date":"2024-05-30","ids":{"openalex":"https://openalex.org/W4399304422","doi":"https://doi.org/10.1145/3650200.3656620"},"language":"en","primary_location":{"id":"doi:10.1145/3650200.3656620","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656620","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656620","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656620","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016835806","display_name":"Xiao Fu","orcid":"https://orcid.org/0009-0000-7370-391X"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiao Fu","raw_affiliation_strings":["National University of Defense Technology, China"],"raw_orcid":"https://orcid.org/0009-0000-7370-391X","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078057947","display_name":"Weiling Yang","orcid":"https://orcid.org/0000-0001-7167-4086"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiling Yang","raw_affiliation_strings":["National University of Defense Technology, China"],"raw_orcid":"https://orcid.org/0000-0001-7167-4086","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006729432","display_name":"Dezun Dong","orcid":"https://orcid.org/0000-0001-6243-8479"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dezun Dong","raw_affiliation_strings":["National University of Defense Technology, China"],"raw_orcid":"https://orcid.org/0000-0001-6243-8479","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101673342","display_name":"Xing Su","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xing Su","raw_affiliation_strings":["National University of Defense Technology, China"],"raw_orcid":"https://orcid.org/0000-0002-7514-1495","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, China","institution_ids":["https://openalex.org/I170215575"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5016835806"],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":1.6665,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.84804688,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"137","last_page":"149"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8797211647033691},{"id":"https://openalex.org/keywords/x86","display_name":"x86","score":0.6731946468353271},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.5909465551376343},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5578987002372742},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5064579248428345},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.4410797357559204},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.4270198345184326},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.4178849756717682},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.14965161681175232}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8797211647033691},{"id":"https://openalex.org/C170723468","wikidata":"https://www.wikidata.org/wiki/Q182933","display_name":"x86","level":3,"score":0.6731946468353271},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.5909465551376343},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5578987002372742},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5064579248428345},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4410797357559204},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.4270198345184326},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.4178849756717682},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.14965161681175232},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3650200.3656620","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656620","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656620","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3650200.3656620","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656620","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656620","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.6399999856948853,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399304422.pdf"},"referenced_works_count":42,"referenced_works":["https://openalex.org/W1555915743","https://openalex.org/W1983157164","https://openalex.org/W2073061372","https://openalex.org/W2252007067","https://openalex.org/W2477528380","https://openalex.org/W2516525699","https://openalex.org/W2804032941","https://openalex.org/W2913790721","https://openalex.org/W2924040443","https://openalex.org/W2946383579","https://openalex.org/W2970971581","https://openalex.org/W2971383048","https://openalex.org/W3035582633","https://openalex.org/W3042627863","https://openalex.org/W3097283637","https://openalex.org/W3123054690","https://openalex.org/W3127904641","https://openalex.org/W3130716829","https://openalex.org/W3145384893","https://openalex.org/W3156745629","https://openalex.org/W3166510811","https://openalex.org/W3210432446","https://openalex.org/W3210601829","https://openalex.org/W3211028753","https://openalex.org/W4205662846","https://openalex.org/W4214669216","https://openalex.org/W4244254628","https://openalex.org/W4250470790","https://openalex.org/W4281758439","https://openalex.org/W4282962596","https://openalex.org/W4289828096","https://openalex.org/W4303645613","https://openalex.org/W4311327542","https://openalex.org/W4311543031","https://openalex.org/W4320067900","https://openalex.org/W4366957282","https://openalex.org/W4378697133","https://openalex.org/W4384705353","https://openalex.org/W4386191499","https://openalex.org/W4388031352","https://openalex.org/W4390692401","https://openalex.org/W4391409448"],"related_works":["https://openalex.org/W17155033","https://openalex.org/W3207760230","https://openalex.org/W1496222301","https://openalex.org/W1590307681","https://openalex.org/W2536018345","https://openalex.org/W4312814274","https://openalex.org/W4285370786","https://openalex.org/W2296488620","https://openalex.org/W1016402482","https://openalex.org/W1987840949"],"abstract_inverted_index":{"Transformers":[0],"reign":[1],"supreme":[2],"in":[3,11,55,74],"natural":[4],"language":[5],"processing,":[6],"representing":[7],"a":[8,83,103],"milestone":[9],"innovation":[10],"deep":[12],"learning.":[13],"For":[14],"high-performance":[15],"model":[16,113],"inference,":[17],"optimizing":[18],"the":[19,27,36,126],"time-consuming":[20],"attention":[21,37,85],"module":[22],"is":[23,39,114],"crucial.":[24],"Owing":[25],"to":[26,49,91,116,125,137],"irregular-shaped":[28],"matrix":[29],"workloads":[30],"and":[31,88,102,121,131,145,161],"intricate":[32],"data":[33,105],"access":[34,52],"patterns,":[35],"operator":[38],"bounded":[40],"by":[41],"memory":[42,51,129],"bandwidth.":[43],"Existing":[44],"works":[45],"utilize":[46],"kernel":[47],"fusion":[48,86],"reduce":[50],"overhead,":[53],"resulting":[54],"promising":[56],"performance":[57],"enhancements.":[58],"However,":[59],"these":[60],"efforts":[61],"primarily":[62],"focus":[63],"on":[64,99],"GPU":[65],"or":[66],"X86":[67],"architectures,":[68],"leaving":[69],"ARM":[70,93,140],"multi-cores,":[71],"commonly":[72],"encountered":[73],"emerging":[75],"HPC":[76],"systems,":[77],"insufficiently":[78],"explored.":[79],"We":[80,134],"present":[81],"MEATTEN,":[82],"memory-efficient":[84],"scheme":[87],"batched":[89,122],"approach":[90,152],"exploit":[92],"multi-core":[94],"CPUs":[95],"effectively.":[96],"It":[97],"builds":[98],"fused":[100],"micro-kernels":[101],"new":[104],"layout":[106],"suitable":[107],"for":[108],"SIMD":[109],"vectorization.":[110],"An":[111],"analytic":[112],"used":[115],"guide":[117],"loop":[118],"permutation,":[119],"tiling,":[120],"parallelization":[123],"according":[124],"on-chip":[127],"hierarchical":[128],"architecture":[130],"workload":[132],"characterization.":[133],"apply":[135],"MEATTEN":[136],"three":[138],"representative":[139],"multi-cores":[141],"against":[142],"state-of-the-art":[143],"libraries":[144],"compilers.":[146],"Experimental":[147],"results":[148],"demonstrate":[149],"that":[150],"our":[151],"consistently":[153],"outperforms":[154],"prior":[155],"approaches":[156],"across":[157],"various":[158],"evaluation":[159],"scenarios":[160],"platforms.":[162]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
