{"id":"https://openalex.org/W7118185222","doi":"https://doi.org/10.1109/tpds.2025.3650515","title":"SMEStencil: Optimizing High-Order Stencils on ARM Multicore Using SME Unit","display_name":"SMEStencil: Optimizing High-Order Stencils on ARM Multicore Using SME Unit","publication_year":2026,"publication_date":"2026-01-05","ids":{"openalex":"https://openalex.org/W7118185222","doi":"https://doi.org/10.1109/tpds.2025.3650515"},"language":null,"primary_location":{"id":"doi:10.1109/tpds.2025.3650515","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3650515","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102869516","display_name":"Yinuo Wang","orcid":"https://orcid.org/0009-0006-8117-4425"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yinuo Wang","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-8117-4425","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121917729","display_name":"Tianqi Mao","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianqi Mao","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-6108-859X","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121937879","display_name":"Lin Gan","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lin Gan","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-3486-6016","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121501499","display_name":"Wubing Wan","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wubing Wan","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-5174-1832","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101941614","display_name":"Zeyu Song","orcid":"https://orcid.org/0000-0002-8975-5359"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyu Song","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109486609","display_name":"Jiayu Fu","orcid":"https://orcid.org/0009-0000-8610-2945"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiayu Fu","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121942645","display_name":"Lanke He","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lanke He","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wenqiang Wang","orcid":"https://orcid.org/0000-0002-8842-6465"},"institutions":[{"id":"https://openalex.org/I4210112812","display_name":"National Supercomputing Center in Shenzhen","ror":"https://ror.org/02291hh73","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210112812"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenqiang Wang","raw_affiliation_strings":["National Supercomputing Center in Shenzhen, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0002-8842-6465","affiliations":[{"raw_affiliation_string":"National Supercomputing Center in Shenzhen, Shenzhen, China","institution_ids":["https://openalex.org/I4210112812"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121958728","display_name":"Zekun Yin","orcid":null},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zekun Yin","raw_affiliation_strings":["Shandong University, Jinan, China"],"raw_orcid":"https://orcid.org/0000-0001-6002-0028","affiliations":[{"raw_affiliation_string":"Shandong University, Jinan, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wei Xue","orcid":"https://orcid.org/0000-0001-9740-6581"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Xue","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-9740-6581","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5120898923","display_name":"Guangwen Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangwen Yang","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-8673-8254","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.01790748,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"37","issue":"3","first_page":"651","last_page":"665"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9602000117301941,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9602000117301941,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.006899999920278788,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.00430000014603138,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stencil","display_name":"Stencil","score":0.8871999979019165},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7351999878883362},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6934000253677368},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.5659999847412109},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.550000011920929},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5081999897956848},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.46560001373291016},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4287000000476837},{"id":"https://openalex.org/keywords/performance-improvement","display_name":"Performance improvement","score":0.4004000127315521}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9017000198364258},{"id":"https://openalex.org/C76752949","wikidata":"https://www.wikidata.org/wiki/Q7607499","display_name":"Stencil","level":2,"score":0.8871999979019165},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7836999893188477},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7351999878883362},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6934000253677368},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.5659999847412109},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.550000011920929},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5081999897956848},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.46560001373291016},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4440999925136566},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4287000000476837},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.4004000127315521},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.3986999988555908},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.3887999951839447},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.3564000129699707},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.3368000090122223},{"id":"https://openalex.org/C79470037","wikidata":"https://www.wikidata.org/wiki/Q279748","display_name":"Out-of-core algorithm","level":2,"score":0.321399986743927},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.3050999939441681},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2953000068664551},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2761000096797943},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.25699999928474426},{"id":"https://openalex.org/C139571649","wikidata":"https://www.wikidata.org/wiki/Q1156793","display_name":"Program optimization","level":3,"score":0.2556000053882599},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.2554999887943268}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2025.3650515","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3650515","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.4456086754798889,"id":"https://metadata.un.org/sdg/10"}],"awards":[{"id":"https://openalex.org/G1296432217","display_name":null,"funder_award_id":"U23A6007","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W95654712","https://openalex.org/W1492882524","https://openalex.org/W1559264847","https://openalex.org/W1997147891","https://openalex.org/W2002555321","https://openalex.org/W2020832962","https://openalex.org/W2039115741","https://openalex.org/W2039378765","https://openalex.org/W2039417226","https://openalex.org/W2043068211","https://openalex.org/W2057897376","https://openalex.org/W2072625243","https://openalex.org/W2074833026","https://openalex.org/W2082423274","https://openalex.org/W2096886382","https://openalex.org/W2117410398","https://openalex.org/W2151764765","https://openalex.org/W2332459347","https://openalex.org/W2342875501","https://openalex.org/W2510992263","https://openalex.org/W2591110697","https://openalex.org/W2772612468","https://openalex.org/W2792247833","https://openalex.org/W2895580424","https://openalex.org/W2902566644","https://openalex.org/W2903775786","https://openalex.org/W2936463352","https://openalex.org/W2984920043","https://openalex.org/W2996929894","https://openalex.org/W3045710976","https://openalex.org/W3132357455","https://openalex.org/W3132763358","https://openalex.org/W3137777618","https://openalex.org/W3138973966","https://openalex.org/W4232301837","https://openalex.org/W4244567449","https://openalex.org/W4283029140","https://openalex.org/W4321636661","https://openalex.org/W4367277178","https://openalex.org/W4376632753","https://openalex.org/W4388105526","https://openalex.org/W4391987273","https://openalex.org/W4405756205","https://openalex.org/W4410321937"],"related_works":[],"abstract_inverted_index":{"Matrix-accelerated":[0],"stencil":[1,102],"computation":[2],"is":[3],"a":[4,77],"hot":[5],"research":[6],"topic,":[7],"yet":[8],"its":[9],"application":[10],"to":[11,57,71,82,134,147,158],"3":[12],"dimensional":[13],"(3D)":[14],"high-order":[15,43],"stencils":[16],"and":[17,36,54,64,76,104,114,151],"HPC":[18,149],"remains":[19],"underexplored.":[20],"With":[21],"the":[22,88,122,138],"emergence":[23],"of":[24,90],"Scalable":[25,51],"Matrix":[26],"Extension(SME)":[27],"on":[28,50,128],"ARMv9-A":[29],"CPU,":[30],"we":[31],"analyze":[32],"SME-based":[33],"accelerating":[34],"strategies":[35],"tailor":[37],"an":[38],"optimal":[39],"approach":[40],"for":[41],"3D":[42],"stencils.":[44],"We":[45,67],"introduce":[46],"algorithmic":[47],"optimizations":[48,70,144],"based":[49],"Vector":[52],"Extension(SVE)":[53],"SME":[55],"unit":[56],"address":[58],"strided":[59],"memory":[60,69,74],"accesses,":[61],"alignment":[62],"conflicts,":[63],"redundant":[65],"accesses.":[66],"propose":[68],"boost":[72],"on-package":[73],"efficiency,":[75],"novel":[78],"multi-thread":[79],"parallelism":[80],"paradigm":[81],"overcome":[83],"data-sharing":[84],"challenges":[85],"caused":[86],"by":[87,132,142],"absence":[89],"shared":[91],"data":[92],"caches.":[93],"SMEStencil":[94,124],"sustains":[95],"consistently":[96],"high":[97],"hardware":[98],"utilization":[99],"across":[100],"diverse":[101],"shapes":[103],"dimensions.":[105],"Our":[106],"DMA-based":[107],"inter-NUMA":[108],"communication":[109],"further":[110],"mitigates":[111],"NUMA":[112],"effects":[113],"MPI":[115],"limitations":[116],"in":[117],"hybrid":[118],"parallelism.":[119],"Combining":[120],"all":[121],"innovations,":[123],"outperforms":[125],"state-of-the-art":[126],"libraries":[127],"Nividia":[129],"A100":[130,165],"GPGPU":[131,166],"up":[133],"2.1\u00d7":[135],".":[136],"Moreover,":[137],"performance":[139],"improvements":[140],"enabled":[141],"our":[143],"translate":[145],"directly":[146],"real-world":[148,156],"applications":[150,157],"enable":[152],"Reverse":[153],"Time":[154],"Migration(RTM)":[155],"yield":[159],"1.8x":[160],"speedup":[161],"versus":[162],"highly-optimized":[163],"Nvidia":[164],"version.":[167]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-05T00:00:00"}
