{"id":"https://openalex.org/W7119502335","doi":"https://doi.org/10.1145/3773656.3773678","title":"Tensor-Core-Optimized Strategies for BLR \u00d7 Tall-Skinny Matrix Multiplication in BEM","display_name":"Tensor-Core-Optimized Strategies for BLR \u00d7 Tall-Skinny Matrix Multiplication in BEM","publication_year":2026,"publication_date":"2026-01-09","ids":{"openalex":"https://openalex.org/W7119502335","doi":"https://doi.org/10.1145/3773656.3773678"},"language":null,"primary_location":{"id":"doi:10.1145/3773656.3773678","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773678","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3773656.3773678","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042523389","display_name":"Akihiro Ida","orcid":"https://orcid.org/0000-0001-7751-1093"},"institutions":[{"id":"https://openalex.org/I1315852903","display_name":"Japan Agency for Marine-Earth Science and Technology","ror":"https://ror.org/059qg2m13","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1315852903"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Akihiro Ida","raw_affiliation_strings":["Japan Agency for Marine-Earth Science and Technology, Yokohama, Japan"],"raw_orcid":"https://orcid.org/0000-0001-7751-1093","affiliations":[{"raw_affiliation_string":"Japan Agency for Marine-Earth Science and Technology, Yokohama, Japan","institution_ids":["https://openalex.org/I1315852903"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100978979","display_name":"Kazuya Goto","orcid":null},"institutions":[{"id":"https://openalex.org/I4210148419","display_name":"National Tax College","ror":"https://ror.org/03z692463","country_code":"JP","type":"education","lineage":["https://openalex.org/I4210148419"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kazuya Goto","raw_affiliation_strings":["PExProCS, LLC, Tokyo, Japan"],"raw_orcid":"https://orcid.org/0009-0000-4146-8134","affiliations":[{"raw_affiliation_string":"PExProCS, LLC, Tokyo, Japan","institution_ids":["https://openalex.org/I4210148419"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025266920","display_name":"Rio Yokota","orcid":null},"institutions":[{"id":"https://openalex.org/I4400009020","display_name":"Institute of Science Tokyo","ror":"https://ror.org/05dqf9946","country_code":null,"type":"education","lineage":["https://openalex.org/I4400009020"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Rio Yokota","raw_affiliation_strings":["Institute of Science Tokyo, Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0001-7573-7873","affiliations":[{"raw_affiliation_string":"Institute of Science Tokyo, Tokyo, Japan","institution_ids":["https://openalex.org/I4400009020"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029599833","display_name":"Tasuku Hiraishi","orcid":"https://orcid.org/0000-0003-4285-893X"},"institutions":[{"id":"https://openalex.org/I52765264","display_name":"Kyoto Tachibana University","ror":"https://ror.org/02e2wvy23","country_code":"JP","type":"education","lineage":["https://openalex.org/I52765264"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tasuku Hiraishi","raw_affiliation_strings":["Kyoto Tachibana University, Kyoto, Japan"],"raw_orcid":"https://orcid.org/0000-0003-4285-893X","affiliations":[{"raw_affiliation_string":"Kyoto Tachibana University, Kyoto, Japan","institution_ids":["https://openalex.org/I52765264"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005033209","display_name":"Toshihiro Hanawa","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Toshihiro Hanawa","raw_affiliation_strings":["The University of Tokyo, Kashiwa, Japan"],"raw_orcid":"https://orcid.org/0000-0002-2970-6037","affiliations":[{"raw_affiliation_string":"The University of Tokyo, Kashiwa, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090608549","display_name":"Takeshi Iwashita","orcid":"https://orcid.org/0000-0003-1938-1723"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takeshi Iwashita","raw_affiliation_strings":["Kyoto University, Kyoto, Japan"],"raw_orcid":"https://orcid.org/0000-0003-1938-1723","affiliations":[{"raw_affiliation_string":"Kyoto University, Kyoto, Japan","institution_ids":["https://openalex.org/I22299242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009492561","display_name":"Masatoshi Kawai","orcid":"https://orcid.org/0009-0003-1454-202X"},"institutions":[{"id":"https://openalex.org/I201537933","display_name":"Tohoku University","ror":"https://ror.org/01dq60k83","country_code":"JP","type":"education","lineage":["https://openalex.org/I201537933"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masatoshi Kawai","raw_affiliation_strings":["Tohoku University, Sendai, Japan"],"raw_orcid":"https://orcid.org/0009-0003-1454-202X","affiliations":[{"raw_affiliation_string":"Tohoku University, Sendai, Japan","institution_ids":["https://openalex.org/I201537933"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080142917","display_name":"Satoshi Ohshima","orcid":"https://orcid.org/0000-0003-4073-5688"},"institutions":[{"id":"https://openalex.org/I135598925","display_name":"Kyushu University","ror":"https://ror.org/00p4k0j84","country_code":"JP","type":"education","lineage":["https://openalex.org/I135598925"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Satoshi Ohshima","raw_affiliation_strings":["Kyushu University, Fukuoka, Japan"],"raw_orcid":"https://orcid.org/0000-0003-4073-5688","affiliations":[{"raw_affiliation_string":"Kyushu University, Fukuoka, Japan","institution_ids":["https://openalex.org/I135598925"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101918467","display_name":"Tetsuya Hoshino","orcid":"https://orcid.org/0009-0004-5349-6852"},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tetsuya Hoshino","raw_affiliation_strings":["Nagoya University, Nagoya, Japan"],"raw_orcid":"https://orcid.org/0009-0004-5349-6852","affiliations":[{"raw_affiliation_string":"Nagoya University, Nagoya, Japan","institution_ids":["https://openalex.org/I60134161"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":18.75,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.9599466,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"153","last_page":"164"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.33329999446868896,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.33329999446868896,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.27129998803138733,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10792","display_name":"Matrix Theory and Algorithms","score":0.11330000311136246,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.6690000295639038},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.5981000065803528},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5838000178337097},{"id":"https://openalex.org/keywords/dimension","display_name":"Dimension (graph theory)","score":0.5482000112533569},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.544700026512146},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5281000137329102},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4828000068664551},{"id":"https://openalex.org/keywords/stack","display_name":"Stack (abstract data type)","score":0.46549999713897705},{"id":"https://openalex.org/keywords/tensor-product","display_name":"Tensor product","score":0.42809998989105225}],"concepts":[{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6690000295639038},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.5981000065803528},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5838000178337097},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.5482000112533569},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.544700026512146},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.541100025177002},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.5356000065803528},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5281000137329102},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.508400022983551},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.48510000109672546},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4828000068664551},{"id":"https://openalex.org/C9395851","wikidata":"https://www.wikidata.org/wiki/Q177929","display_name":"Stack (abstract data type)","level":2,"score":0.46549999713897705},{"id":"https://openalex.org/C51255310","wikidata":"https://www.wikidata.org/wiki/Q1163016","display_name":"Tensor product","level":2,"score":0.42809998989105225},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.4185999929904938},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.3749000132083893},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.37459999322891235},{"id":"https://openalex.org/C60798267","wikidata":"https://www.wikidata.org/wiki/Q1226939","display_name":"Division (mathematics)","level":2,"score":0.37049999833106995},{"id":"https://openalex.org/C85817219","wikidata":"https://www.wikidata.org/wiki/Q884772","display_name":"Block matrix","level":3,"score":0.36039999127388},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.35280001163482666},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.3434999883174896},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.328900009393692},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.2858000099658966},{"id":"https://openalex.org/C126312332","wikidata":"https://www.wikidata.org/wiki/Q2658","display_name":"Gaussian elimination","level":3,"score":0.2851000130176544},{"id":"https://openalex.org/C113315163","wikidata":"https://www.wikidata.org/wiki/Q7625159","display_name":"Structure tensor","level":3,"score":0.28130000829696655},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.2800000011920929},{"id":"https://openalex.org/C2780365336","wikidata":"https://www.wikidata.org/wiki/Q25047934","display_name":"Single-core","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3773656.3773678","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773678","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3773656.3773678","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773678","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1884521333","https://openalex.org/W1968376936","https://openalex.org/W1992979775","https://openalex.org/W2018419001","https://openalex.org/W2031528426","https://openalex.org/W2041876368","https://openalex.org/W2087348355","https://openalex.org/W2088113364","https://openalex.org/W2141719776","https://openalex.org/W2160774872","https://openalex.org/W2166971959","https://openalex.org/W2188296733","https://openalex.org/W2282230507","https://openalex.org/W2612290557","https://openalex.org/W2622400073","https://openalex.org/W2767540596","https://openalex.org/W2775119255","https://openalex.org/W2782745921","https://openalex.org/W2791204867","https://openalex.org/W2886763336","https://openalex.org/W2886967310","https://openalex.org/W2941979784","https://openalex.org/W2986206952","https://openalex.org/W2995553094","https://openalex.org/W2999378407","https://openalex.org/W3009819974","https://openalex.org/W4291517469","https://openalex.org/W4302760152","https://openalex.org/W4362713195","https://openalex.org/W4410222576"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"AI-oriented":[1,198],"accelerators,":[2],"such":[3],"as":[4,165,167],"Tensor":[5,28,93,133,190],"Cores":[6,29],"embedded":[7],"on":[8,139],"NVIDIA":[9,140],"GPUs,":[10],"provide":[11],"substantially":[12],"higher":[13],"throughput":[14],"for":[15,30,200],"matrix":[16,53,66,169],"operations":[17],"than":[18,156,163],"conventional":[19,69],"HPC":[20],"units.":[21],"This":[22],"paper":[23],"presents":[24],"strategies":[25],"to":[26,77,108,123,151],"exploit":[27],"accelerating":[31],"a":[32,44,51,59,64,157],"key":[33],"kernel":[34],"in":[35],"the":[36,41,128,145,183,189,195],"boundary":[37],"element":[38],"method":[39,184],"(BEM):":[40],"multiplication":[42],"of":[43,188,197],"Block":[45],"Low-Rank":[46],"(BLR)":[47],"matrix,":[48],"which":[49],"approximates":[50],"dense":[52,61,65,168],"through":[54],"blockwise":[55],"low-rank":[56],"representations,":[57],"by":[58,79],"tall\u2013skinny":[60],"matrix.":[62],"While":[63],"derived":[67],"from":[68],"BEM":[70],"requires":[71],"O(N2)":[72],"memory,":[73],"BLR":[74,159],"reduces":[75],"this":[76,98],"O(N1.5)":[78],"compressing":[80],"far-field":[81],"interactions,":[82],"but":[83],"its":[84],"irregular":[85],"block":[86,111],"sizes":[87],"and":[88,118,161,179],"varying":[89],"ranks":[90],"hinder":[91],"efficient":[92],"Core":[94,134],"utilization.":[95],"To":[96],"address":[97],"challenge,":[99],"we":[100],"introduce":[101],"Full-Depth":[102],"Tree":[103],"with":[104,114],"Equal":[105],"Division":[106],"(FDE)":[107],"generate":[109],"uniform":[110],"shapes":[112],"compatible":[113],"batched":[115],"GEMM":[116],"operations,":[117],"Sliced":[119],"Rank":[120],"Alignment":[121],"(SRA)":[122],"minimize":[124],"zero-padding":[125],"overhead":[126],"along":[127],"rank":[129],"dimension":[130],"while":[131,171],"satisfying":[132],"alignment":[135],"constraints.":[136],"Numerical":[137],"experiments":[138],"A100":[141],"GPUs":[142],"demonstrate":[143],"that":[144],"proposed":[146],"FDE+SRA":[147],"approach":[148],"achieves":[149],"up":[150],"three":[152],"times":[153],"faster":[154],"performance":[155],"straightforward":[158],"implementation":[160],"more":[162],"twice":[164],"fast":[166],"multiplication,":[170],"significantly":[172],"reducing":[173],"memory":[174],"usage.":[175],"For":[176],"large-scale":[177],"problems":[178],"multiple":[180],"righthand":[181],"sides,":[182],"reaches":[185],"nearly":[186],"75%":[187],"Core\u2019s":[191],"peak":[192],"performance,":[193],"demonstrating":[194],"effectiveness":[196],"hardware":[199],"BLR-based":[201],"computations.":[202]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-09T00:00:00"}
