{"id":"https://openalex.org/W4390692401","doi":"https://doi.org/10.1109/tpds.2024.3350368","title":"Optimizing Full-Spectrum Matrix Multiplications on ARMv8 Multi-Core CPUs","display_name":"Optimizing Full-Spectrum Matrix Multiplications on ARMv8 Multi-Core CPUs","publication_year":2024,"publication_date":"2024-01-10","ids":{"openalex":"https://openalex.org/W4390692401","doi":"https://doi.org/10.1109/tpds.2024.3350368"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2024.3350368","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3350368","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078057947","display_name":"Weiling Yang","orcid":"https://orcid.org/0000-0001-7167-4086"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Weiling Yang","raw_affiliation_strings":["School of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083171604","display_name":"Jianbin Fang","orcid":"https://orcid.org/0000-0003-3542-4869"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianbin Fang","raw_affiliation_strings":["School of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006729432","display_name":"Dezun Dong","orcid":"https://orcid.org/0000-0001-6243-8479"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dezun Dong","raw_affiliation_strings":["School of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101673342","display_name":"Xing Su","orcid":"https://orcid.org/0000-0002-7514-1495"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xing Su","raw_affiliation_strings":["School of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5030293790","display_name":"Zheng Wang","orcid":"https://orcid.org/0009-0001-7858-6238"},"institutions":[{"id":"https://openalex.org/I37802460","display_name":"Northwest University","ror":"https://ror.org/00z3td547","country_code":"CN","type":"education","lineage":["https://openalex.org/I37802460"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Wang","raw_affiliation_strings":["Northwest University, Xi&#x2019;an, China"],"affiliations":[{"raw_affiliation_string":"Northwest University, Xi&#x2019;an, China","institution_ids":["https://openalex.org/I37802460"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5078057947"],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":3.6299,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.93606432,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"35","issue":"3","first_page":"439","last_page":"454"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8904615044593811},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.8427727818489075},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6565586924552917},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.611096203327179},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.6094898581504822},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.596480667591095},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.5388796329498291},{"id":"https://openalex.org/keywords/subroutine","display_name":"Subroutine","score":0.5231714844703674},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.46379217505455017},{"id":"https://openalex.org/keywords/xeon-phi","display_name":"Xeon Phi","score":0.4284062087535858},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4105090796947479},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.32803672552108765},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1926293671131134},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.18975427746772766}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8904615044593811},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.8427727818489075},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6565586924552917},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.611096203327179},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.6094898581504822},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.596480667591095},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.5388796329498291},{"id":"https://openalex.org/C96147967","wikidata":"https://www.wikidata.org/wiki/Q190686","display_name":"Subroutine","level":2,"score":0.5231714844703674},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.46379217505455017},{"id":"https://openalex.org/C96972482","wikidata":"https://www.wikidata.org/wiki/Q1049168","display_name":"Xeon Phi","level":2,"score":0.4284062087535858},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4105090796947479},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.32803672552108765},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1926293671131134},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.18975427746772766},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2024.3350368","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3350368","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4583476002","display_name":null,"funder_award_id":"61872294","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G503725912","display_name":null,"funder_award_id":"61972408","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W1983157164","https://openalex.org/W1986007546","https://openalex.org/W2038080797","https://openalex.org/W2043275593","https://openalex.org/W2073061372","https://openalex.org/W2084379367","https://openalex.org/W2186615578","https://openalex.org/W2194775991","https://openalex.org/W2252007067","https://openalex.org/W2293241800","https://openalex.org/W2516525699","https://openalex.org/W2570343428","https://openalex.org/W2620838383","https://openalex.org/W2786374423","https://openalex.org/W2896652512","https://openalex.org/W2924040443","https://openalex.org/W2939501468","https://openalex.org/W2949953997","https://openalex.org/W2951894856","https://openalex.org/W2964017942","https://openalex.org/W2971383048","https://openalex.org/W2995113061","https://openalex.org/W3016542674","https://openalex.org/W3041159379","https://openalex.org/W3042343931","https://openalex.org/W3044796228","https://openalex.org/W3086476857","https://openalex.org/W3091995851","https://openalex.org/W3099506637","https://openalex.org/W3123054690","https://openalex.org/W3127904641","https://openalex.org/W3132840842","https://openalex.org/W3136479147","https://openalex.org/W3156745629","https://openalex.org/W3176085048","https://openalex.org/W3209244307","https://openalex.org/W3210601829","https://openalex.org/W4244254628","https://openalex.org/W4288083057","https://openalex.org/W4320067944","https://openalex.org/W6631660994","https://openalex.org/W6637373629","https://openalex.org/W6684191040","https://openalex.org/W6686509673","https://openalex.org/W6745834200","https://openalex.org/W6761975575","https://openalex.org/W6779728309"],"related_works":["https://openalex.org/W2391861012","https://openalex.org/W1495480229","https://openalex.org/W1585000785","https://openalex.org/W3121314575","https://openalex.org/W3147497457","https://openalex.org/W4206336957","https://openalex.org/W4225552076","https://openalex.org/W2550374317","https://openalex.org/W1973961853","https://openalex.org/W3004823601"],"abstract_inverted_index":{"General":[0],"Matrix":[1],"Multiplication":[2],"(GEMM)":[3],"is":[4,58,101,110],"a":[5],"key":[6],"subroutine":[7],"in":[8,42,104],"high-performance":[9],"computing.":[10],"While":[11],"the":[12,97,114,137,143],"mainstream":[13,164],"Basic":[14],"Linear":[15],"Algebra":[16],"Subprograms":[17],"(BLAS)":[18],"libraries":[19],"can":[20],"deliver":[21],"good":[22],"performance":[23,53],"on":[24,50,54,64,69],"large":[25],"and":[26,35,88,122,139,159,183],"regular-shaped":[27,90],"GEMMs,":[28,37,84],"they":[29],"are":[30,39],"inadequate":[31],"for":[32,62,119,199],"optimizing":[33],"small":[34,124],"irregular-shaped":[36],"which":[38,100],"commonly":[40],"seen":[41],"emerging":[43,65],"HPC":[44,66,105],"applications.":[45],"Recent":[46],"research":[47],"has":[48],"focused":[49],"improving":[51],"GEMM":[52,132,144,181],"GPUs,":[55],"but":[56],"there":[57],"still":[59],"significant":[60],"room":[61],"improvement":[63],"hardware":[67,184],"based":[68],"multi-core":[70,157],"CPUs.":[71],"We":[72,146,186],"present":[73],"<sc":[74,92,107,148,172,190],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[75,93,108,149,173,191],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">LibShalom2</small>":[76,94,109,150,174,192],",":[77],"an":[78,194],"open-source":[79],"library":[80],"to":[81,112,130,154],"optimize":[82],"full-spectrum":[83,180],"taking":[85],"small,":[86],"irregular-shaped,":[87],"large-scale":[89],"matrices.":[91,125],"explicitly":[95],"targets":[96],"ARMv8":[98,156],"architecture,":[99],"becoming":[102],"common":[103],"systems.":[106],"designed":[111],"minimize":[113],"expensive":[115],"memory":[116],"accessing":[117],"overhead":[118],"data":[120],"packing":[121],"processing":[123],"It":[126],"uses":[127],"analytic":[128],"methods":[129],"determine":[131],"kernel":[133],"optimization":[134],"parameters,":[135],"enhancing":[136],"computation":[138],"parallelization":[140],"efficiency":[141],"of":[142,197],"kernels.":[145],"evaluate":[147],"by":[151],"applying":[152],"it":[153,161],"three":[155],"architectures":[158],"comparing":[160],"against":[162],"five":[163],"linear":[165],"algebra":[166],"libraries.":[167],"Experimental":[168],"results":[169],"show":[170,188],"that":[171,189],"consistently":[175],"outperforms":[176],"existing":[177],"solutions":[178],"across":[179],"workloads":[182],"architectures.":[185],"also":[187],"delivers":[193],"average":[195],"speedup":[196],"2.2x":[198],"real-life":[200],"neural":[201],"network":[202],"workloads.":[203]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
