{"id":"https://openalex.org/W4410321937","doi":"https://doi.org/10.1145/3725798.3725803","title":"Can Tensor Cores Benefit Memory-Bound Kernels? (NO!)","display_name":"Can Tensor Cores Benefit Memory-Bound Kernels? (NO!)","publication_year":2025,"publication_date":"2025-03-01","ids":{"openalex":"https://openalex.org/W4410321937","doi":"https://doi.org/10.1145/3725798.3725803"},"language":"en","primary_location":{"id":"doi:10.1145/3725798.3725803","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3725798.3725803","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3725798.3725803","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 17th Workshop on General Purpose Processing Using GPU","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3725798.3725803","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078250463","display_name":"Lingqi Zhang","orcid":"https://orcid.org/0000-0002-2452-1551"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Lingqi Zhang","raw_affiliation_strings":["RIKEN Center for Computational Science, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Tokyo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100779661","display_name":"Jiajun Huang","orcid":"https://orcid.org/0000-0001-5092-3987"},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiajun Huang","raw_affiliation_strings":["University of California, Riverside, Riverside, California, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Riverside, Riverside, California, USA","institution_ids":["https://openalex.org/I103635307"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103124363","display_name":"Sheng Di","orcid":"https://orcid.org/0000-0002-9935-5674"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sheng Di","raw_affiliation_strings":["Argonne National Laboratory, Lemont, Illinois, USA"],"affiliations":[{"raw_affiliation_string":"Argonne National Laboratory, Lemont, Illinois, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100634486","display_name":"Satoshi Matsuoka","orcid":"https://orcid.org/0000-0003-1910-8532"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Satoshi Matsuoka","raw_affiliation_strings":["RIKEN Center for Computational Science, Kobe, Japan"],"affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Kobe, Japan","institution_ids":["https://openalex.org/I4210129730"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5002208999","display_name":"Mohamed Wahib","orcid":"https://orcid.org/0000-0002-7165-2095"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Mohamed Wahib","raw_affiliation_strings":["RIKEN Center for Computational Science, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Tokyo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5078250463"],"corresponding_institution_ids":["https://openalex.org/I4210129730"],"apc_list":null,"apc_paid":null,"fwci":4.3478,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.92919075,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"28","last_page":"34"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.9807000160217285,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5943500995635986},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5607922673225403},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.41022229194641113},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.21762949228286743},{"id":"https://openalex.org/keywords/pure-mathematics","display_name":"Pure mathematics","score":0.1372053027153015}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5943500995635986},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5607922673225403},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.41022229194641113},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.21762949228286743},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.1372053027153015}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3725798.3725803","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3725798.3725803","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3725798.3725803","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 17th Workshop on General Purpose Processing Using GPU","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3725798.3725803","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3725798.3725803","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3725798.3725803","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 17th Workshop on General Purpose Processing Using GPU","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1116540683","display_name":null,"funder_award_id":"DE-AC02-06CH11","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G1751644051","display_name":null,"funder_award_id":"DE-AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G1844751952","display_name":null,"funder_award_id":"DE-AC02_06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G3000213308","display_name":null,"funder_award_id":"DE-AC02-06CH113","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G3801222974","display_name":null,"funder_award_id":"DE-AC02-06CH1135","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G498139845","display_name":null,"funder_award_id":"DE-AC02","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G650626725","display_name":null,"funder_award_id":"E-AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G6918803902","display_name":null,"funder_award_id":"06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G8143874970","display_name":null,"funder_award_id":"AC02-06CH11357","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"}],"funders":[{"id":"https://openalex.org/F4320332359","display_name":"Office of Science","ror":"https://ror.org/00mmn6b08"},{"id":"https://openalex.org/F4320337506","display_name":"Advanced Scientific Computing Research","ror":"https://ror.org/0012c7r22"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410321937.pdf","grobid_xml":"https://content.openalex.org/works/W4410321937.grobid-xml"},"referenced_works_count":32,"referenced_works":["https://openalex.org/W2002555321","https://openalex.org/W2035080386","https://openalex.org/W2049009664","https://openalex.org/W2101511474","https://openalex.org/W2315715336","https://openalex.org/W2626696598","https://openalex.org/W2772612468","https://openalex.org/W2798724095","https://openalex.org/W2895305554","https://openalex.org/W2983865192","https://openalex.org/W2984920043","https://openalex.org/W2996929894","https://openalex.org/W3000120212","https://openalex.org/W3044913359","https://openalex.org/W3099814709","https://openalex.org/W3174843332","https://openalex.org/W3175644086","https://openalex.org/W3205883294","https://openalex.org/W4221160294","https://openalex.org/W4308090436","https://openalex.org/W4309672181","https://openalex.org/W4367684987","https://openalex.org/W4376632753","https://openalex.org/W4388661983","https://openalex.org/W4391987273","https://openalex.org/W4392884779","https://openalex.org/W4395106472","https://openalex.org/W4399282204","https://openalex.org/W4400409880","https://openalex.org/W4405755887","https://openalex.org/W4405756173","https://openalex.org/W4405756205"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Tensor":[0],"cores":[1,53,58,88,99,135],"are":[2],"specialized":[3],"processing":[4],"units":[5],"within":[6],"GPUs":[7],"that":[8,51,86,129],"have":[9,30,49],"demonstrated":[10],"significant":[11],"efficiency":[12],"gains":[13],"in":[14,103],"compute-bound":[15],"applications":[16],"such":[17],"as":[18],"Deep":[19],"Learning":[20],"Training":[21],"by":[22],"accelerating":[23],"dense":[24,38],"matrix":[25,39],"operations.Given":[26],"their":[27],"success,":[28],"researchers":[29],"attempted":[31],"to":[32,41],"extend":[33],"tensor":[34,52,87,134],"core":[35],"capabilities":[36],"beyond":[37],"computations":[40],"other":[42],"computational":[43],"patterns,":[44],"including":[45],"memory-bound":[46,61,101,122,131],"kernels.Recent":[47],"studies":[48],"reported":[50],"can":[54,89],"outperform":[55],"traditional":[56],"CUDA":[57,98,143],"even":[59],"on":[60],"kernels,":[62],"where":[63],"the":[64],"primary":[65],"performance":[66,140],"bottleneck":[67],"is":[68],"not":[69,137],"computation.In":[70],"this":[71,113],"research,":[72],"we":[73],"challenge":[74],"these":[75],"findings":[76],"through":[77,116],"both":[78],"theoretical":[79,83,114],"and":[80,109,126],"empirical":[81,117],"analysis.Our":[82],"analysis":[84,118],"reveals":[85],"achieve":[90],"a":[91],"maximum":[92],"speedup":[93],"of":[94,119],"only":[95],"1.33":[96],"over":[97,142],"for":[100],"kernels":[102,132],"double":[104],"precision":[105],"(for":[106],"V100,":[107],"A100,":[108],"H100":[110],"GPUs).We":[111],"validate":[112],"limit":[115],"three":[120],"representative":[121],"kernels-STREAM":[123],"Scale,":[124],"SpMV,":[125],"stencil.We":[127],"demonstrate":[128],"optimizing":[130],"using":[133],"does":[136],"yield":[138],"sound":[139],"improvements":[141],"cores.":[144]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-11T08:14:18.477133","created_date":"2025-10-10T00:00:00"}
