{"id":"https://openalex.org/W7119508007","doi":"https://doi.org/10.3390/computers15010039","title":"Efficient Low-Precision GEMM on Ascend NPU: HGEMM\u2019s Synergy of Pipeline Scheduling, Tiling, and Memory Optimization","display_name":"Efficient Low-Precision GEMM on Ascend NPU: HGEMM\u2019s Synergy of Pipeline Scheduling, Tiling, and Memory Optimization","publication_year":2026,"publication_date":"2026-01-08","ids":{"openalex":"https://openalex.org/W7119508007","doi":"https://doi.org/10.3390/computers15010039"},"language":"en","primary_location":{"id":"doi:10.3390/computers15010039","is_oa":true,"landing_page_url":"https://doi.org/10.3390/computers15010039","pdf_url":"https://www.mdpi.com/2073-431X/15/1/39/pdf?version=1767886326","source":{"id":"https://openalex.org/S4210228075","display_name":"Computers","issn_l":"2073-431X","issn":["2073-431X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2073-431X/15/1/39/pdf?version=1767886326","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122308788","display_name":"Erkun Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]},{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Erkun Zhang","raw_affiliation_strings":["Pengcheng Laboratory, Shenzhen 518071, China","School of Future Technology, South China University of Technology, Guangzhou 510641, China"],"raw_orcid":"https://orcid.org/0000-0001-9872-3095","affiliations":[{"raw_affiliation_string":"Pengcheng Laboratory, Shenzhen 518071, China","institution_ids":["https://openalex.org/I4210136793"]},{"raw_affiliation_string":"School of Future Technology, South China University of Technology, Guangzhou 510641, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122328017","display_name":"Pengxiang Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengxiang Xu","raw_affiliation_strings":["Pengcheng Laboratory, Shenzhen 518071, China"],"raw_orcid":"https://orcid.org/0000-0002-2273-1504","affiliations":[{"raw_affiliation_string":"Pengcheng Laboratory, Shenzhen 518071, China","institution_ids":["https://openalex.org/I4210136793"]}]},{"author_position":"last","author":{"id":null,"display_name":"Lu Lu","orcid":"https://orcid.org/0000-0001-6372-7088"},"institutions":[{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]},{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lu Lu","raw_affiliation_strings":["Pengcheng Laboratory, Shenzhen 518071, China","School of Computer Science & Engineering, South China University of Technology, Guangzhou 510006, China"],"raw_orcid":"https://orcid.org/0000-0001-6372-7088","affiliations":[{"raw_affiliation_string":"Pengcheng Laboratory, Shenzhen 518071, China","institution_ids":["https://openalex.org/I4210136793"]},{"raw_affiliation_string":"School of Computer Science & Engineering, South China University of Technology, Guangzhou 510006, China","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5122308788"],"corresponding_institution_ids":["https://openalex.org/I4210136793","https://openalex.org/I90610280"],"apc_list":{"value":1600,"currency":"CHF","value_usd":1732},"apc_paid":{"value":1600,"currency":"CHF","value_usd":1732},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.04184688,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"15","issue":"1","first_page":"39","last_page":"39"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2745000123977661,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2745000123977661,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.21089999377727509,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.09109999984502792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.845300018787384},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5254999995231628},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5059999823570251},{"id":"https://openalex.org/keywords/padding","display_name":"Padding","score":0.5034000277519226},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.461899995803833},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.454800009727478},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.4196999967098236},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.41350001096725464}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.883899986743927},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.845300018787384},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5760999917984009},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5254999995231628},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5059999823570251},{"id":"https://openalex.org/C165435473","wikidata":"https://www.wikidata.org/wiki/Q1509884","display_name":"Padding","level":2,"score":0.5034000277519226},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.461899995803833},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.454800009727478},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.4196999967098236},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.41350001096725464},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.4104999899864197},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.39910000562667847},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C76970557","wikidata":"https://www.wikidata.org/wiki/Q1869750","display_name":"Loop unrolling","level":3,"score":0.3292999863624573},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2703000009059906},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2687000036239624},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2621000111103058},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.257099986076355},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2567000091075897}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3390/computers15010039","is_oa":true,"landing_page_url":"https://doi.org/10.3390/computers15010039","pdf_url":"https://www.mdpi.com/2073-431X/15/1/39/pdf?version=1767886326","source":{"id":"https://openalex.org/S4210228075","display_name":"Computers","issn_l":"2073-431X","issn":["2073-431X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:78d68306b499477b8fcb948864f341aa","is_oa":true,"landing_page_url":"https://doaj.org/article/78d68306b499477b8fcb948864f341aa","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computers, Vol 15, Iss 1, p 39 (2026)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.3390/computers15010039","is_oa":true,"landing_page_url":"https://doi.org/10.3390/computers15010039","pdf_url":"https://www.mdpi.com/2073-431X/15/1/39/pdf?version=1767886326","source":{"id":"https://openalex.org/S4210228075","display_name":"Computers","issn_l":"2073-431X","issn":["2073-431X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computers","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.44306060671806335}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7119508007.pdf","grobid_xml":"https://content.openalex.org/works/W7119508007.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"one":[1],"of":[2,26,42,54,70,83,113,149],"the":[3,23,38,87,127,147,170,186,210,217,227],"most":[4],"widely":[5],"used":[6],"high-performance":[7],"kernels,":[8],"General":[9],"Matrix":[10],"Multiplication,":[11],"or":[12],"GEMM,":[13],"plays":[14],"a":[15,111,117,200],"pivotal":[16],"role":[17],"in":[18,230],"diverse":[19],"application":[20],"fields.":[21],"With":[22],"growing":[24],"prevalence":[25],"training":[27],"for":[28,61,126],"Convolutional":[29],"Neural":[30,48],"Networks":[31],"(CNNs)":[32],"and":[33,40,77,101,107,116,132,135,153,179],"Large":[34],"Language":[35],"Models":[36],"(LLMs),":[37],"design":[39],"implementation":[41,173,188,220,229],"high-efficiency,":[43],"low-precision":[44],"GEMM":[45],"on":[46,189],"modern":[47],"Processing":[49],"Unit":[50],"(NPU)":[51],"platforms":[52],"are":[53,86,122,143],"great":[55],"significance.":[56],"In":[57],"this":[58,84],"work,":[59],"HGEMM":[60,163,212],"Ascend":[62,176],"NPU":[63,177],"is":[64,93],"presented,":[65],"which":[66,95],"enables":[67],"collaborative":[68],"processing":[69],"different":[71],"computation":[72],"types":[73],"by":[74],"Cube":[75],"units":[76],"Vector":[78],"units.":[79],"The":[80],"major":[81],"contributions":[82],"work":[85],"following:":[88],"(i)":[89],"dual-stream":[90],"pipeline":[91],"scheduling":[92],"implemented,":[94],"synchronizes":[96],"padding":[97],"operations,":[98],"matrix\u2013matrix":[99],"multiplications,":[100],"element-wise":[102],"instructions":[103],"across":[104],"hierarchical":[105],"buffers":[106],"compute":[108],"units;":[109],"(ii)":[110],"suite":[112],"tiling":[114],"strategies":[115],"corresponding":[118],"strategy":[119],"selection":[120],"mechanism":[121],"developed,":[123],"comprehensively":[124],"accounting":[125],"impacts":[128],"from":[129],"M,":[130],"N,":[131],"K":[133],"directions;":[134],"(iii)":[136],"SplitK":[137],"as":[138,140],"well":[139],"ShuffleK":[141],"methods":[142],"raised":[144],"to":[145,185,226],"address":[146],"challenges":[148],"memory":[150],"access":[151],"efficiency":[152,224],"AI":[154],"Core":[155],"utilization.":[156],"Extensive":[157],"evaluations":[158],"demonstrate":[159],"that":[160],"our":[161],"proposed":[162,211],"achieves":[164,199],"an":[165,180],"average":[166,181],"3.56\u00d7":[167],"speedup":[168,183],"over":[169],"CATLASS":[171,218],"template-based":[172,219],"under":[174,193,206],"identical":[175],"configurations,":[178],"2.10\u00d7":[182],"relative":[184],"cuBLAS":[187,228],"Nvidia":[190],"A800":[191],"GPUs":[192],"general":[194],"random":[195],"workloads.":[196,208,235],"It":[197],"also":[198,222],"maximum":[201],"computational":[202],"utilization":[203],"exceeding":[204],"90%":[205],"benchmark":[207],"Moreover,":[209],"not":[213],"only":[214],"significantly":[215],"outperforms":[216],"but":[221],"delivers":[223],"comparable":[225],"OPT-based":[231],"bandwidth-limited":[232],"LLM":[233],"inference":[234]},"counts_by_year":[],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2026-01-09T00:00:00"}
