{"id":"https://openalex.org/W4416146791","doi":"https://doi.org/10.1145/3712285.3759769","title":"HStencil: Matrix-Vector Stencil Computation with Interleaved Outer Product and MLA","display_name":"HStencil: Matrix-Vector Stencil Computation with Interleaved Outer Product and MLA","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W4416146791","doi":"https://doi.org/10.1145/3712285.3759769"},"language":null,"primary_location":{"id":"doi:10.1145/3712285.3759769","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3712285.3759769","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042598444","display_name":"Han Jie Huang","orcid":"https://orcid.org/0009-0009-3774-9115"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Han Huang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067845268","display_name":"Jiabin Xie","orcid":"https://orcid.org/0000-0003-0770-3086"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiabin Xie","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088979128","display_name":"Guangnan Feng","orcid":"https://orcid.org/0000-0002-1382-280X"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangnan Feng","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039843650","display_name":"Xianwei Zhang","orcid":"https://orcid.org/0000-0003-3507-4299"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianwei Zhang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041534890","display_name":"Dan Huang","orcid":"https://orcid.org/0000-0001-5582-1031"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dan Huang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101483479","display_name":"Zhiguang Chen","orcid":"https://orcid.org/0000-0002-9318-5715"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiguang Chen","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101633465","display_name":"Yutong Lu","orcid":"https://orcid.org/0000-0001-5315-3375"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yutong Lu","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, Guangdong, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5042598444"],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":4.5405,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.95085559,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1816","last_page":"1829"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9659000039100647,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9659000039100647,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.0052999998442828655,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.004100000020116568,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stencil","display_name":"Stencil","score":0.954200029373169},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.5845000147819519},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5634999871253967},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5600000023841858},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5394999980926514},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4966000020503998},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.4507000148296356},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.40070000290870667}],"concepts":[{"id":"https://openalex.org/C76752949","wikidata":"https://www.wikidata.org/wiki/Q7607499","display_name":"Stencil","level":2,"score":0.954200029373169},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8270999789237976},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7189000248908997},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.5845000147819519},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5634999871253967},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5600000023841858},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5394999980926514},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4966000020503998},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.4507000148296356},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.40070000290870667},{"id":"https://openalex.org/C133588205","wikidata":"https://www.wikidata.org/wiki/Q28455645","display_name":"Instruction prefetch","level":3,"score":0.36340001225471497},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.34130001068115234},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3246000111103058},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3160000145435333},{"id":"https://openalex.org/C90673727","wikidata":"https://www.wikidata.org/wiki/Q901718","display_name":"Product (mathematics)","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2913999855518341},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.28760001063346863},{"id":"https://openalex.org/C2778770139","wikidata":"https://www.wikidata.org/wiki/Q1966904","display_name":"Solver","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.2574999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3712285.3759769","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3712285.3759769","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1559264847","https://openalex.org/W2039417226","https://openalex.org/W2171006257","https://openalex.org/W2323909431","https://openalex.org/W2901549770","https://openalex.org/W2952562115","https://openalex.org/W3037775421","https://openalex.org/W3129397352","https://openalex.org/W3132532188","https://openalex.org/W3147517636","https://openalex.org/W3174843332","https://openalex.org/W3204728131","https://openalex.org/W3205799311","https://openalex.org/W4200091031","https://openalex.org/W4283029140","https://openalex.org/W4312309118","https://openalex.org/W4318603197","https://openalex.org/W4321446155","https://openalex.org/W4381748048","https://openalex.org/W4388661983","https://openalex.org/W4388904208","https://openalex.org/W4391971595","https://openalex.org/W4391987273","https://openalex.org/W4395106472","https://openalex.org/W4395959113","https://openalex.org/W4399304499","https://openalex.org/W4399527102","https://openalex.org/W4399851697","https://openalex.org/W4400151937","https://openalex.org/W4400409888","https://openalex.org/W4401408710","https://openalex.org/W4401862084","https://openalex.org/W4405756173","https://openalex.org/W4405756205"],"related_works":[],"abstract_inverted_index":{"Stencil":[0],"computations":[1],"are":[2],"fundamental":[3],"to":[4,25,41,50,92,103,111],"various":[5],"HPC":[6],"and":[7,57,73,89,107],"intelligent":[8],"computing":[9,36,45,69],"applications,":[10],"often":[11],"consuming":[12],"significant":[13],"execution":[14,102],"time.":[15],"The":[16],"emergence":[17],"of":[18,131],"specialized":[19],"matrix":[20,31,72,88],"units":[21,33,91],"presents":[22],"new":[23],"opportunities":[24],"accelerate":[26],"stencil":[27,68],"computations.":[28],"While":[29],"scalable":[30],"compute":[32],"provide":[34],"substantial":[35],"horsepower,":[37],"prior":[38],"efforts":[39],"fail":[40],"fully":[42],"utilize":[43],"the":[44],"capabilities":[46],"for":[47],"stencils":[48],"due":[49],"suboptimal":[51],"matrix-unit":[52],"utilization,":[53],"limited":[54],"instruction-level":[55,105],"parallelism,":[56],"low":[58],"cache":[59,119],"hit":[60],"rates.":[61],"This":[62],"paper":[63],"introduces":[64],"HStencil,":[65],"a":[66],"novel":[67],"framework":[70],"utilizing":[71],"vector":[74,90],"units.":[75],"HStencil":[76,127],"addresses":[77],"these":[78],"challenges":[79],"through":[80],"three":[81],"contributions:":[82],"1)":[83],"microkernels":[84],"that":[85,126],"jointly":[86],"leverage":[87],"enhance":[93,104],"hardware":[94],"utilization;":[95],"2)":[96],"fine-grained":[97],"instruction":[98],"scheduling":[99],"with":[100],"interleaved":[101],"parallelism;":[106],"3)":[108],"spatial":[109],"prefetch":[110],"sustain":[112],"high":[113],"performance":[114,146],"when":[115],"working":[116],"sets":[117],"exceed":[118],"capacity.":[120],"Evaluations":[121],"on":[122],"representative":[123],"benchmarks":[124],"demonstrate":[125],"achieves":[128],"maximum":[129],"speedups":[130],"1.81x":[132],"\u2013":[133],"5.76x":[134],"over":[135],"auto-vectorization":[136],"across":[137],"different":[138],"CPU":[139],"platforms,":[140],"delivers":[141],"31%":[142],"-":[143],"91%":[144],"higher":[145],"versus":[147],"state-of-the-art":[148],"methods.":[149]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-12T00:00:00"}
