{"id":"https://openalex.org/W7160285619","doi":"https://doi.org/10.1109/lca.2026.3690418","title":"Near-HBM Tensor Core Acceleration for Fine-Grained Sparse Matrix-Matrix Multiplication","display_name":"Near-HBM Tensor Core Acceleration for Fine-Grained Sparse Matrix-Matrix Multiplication","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7160285619","doi":"https://doi.org/10.1109/lca.2026.3690418"},"language":null,"primary_location":{"id":"doi:10.1109/lca.2026.3690418","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2026.3690418","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135336032","display_name":"Lihan Hu","orcid":"https://orcid.org/0009-0008-7152-7842"},"institutions":[{"id":"https://openalex.org/I10654025","display_name":"SK Group (United States)","ror":"https://ror.org/00qajw440","country_code":"US","type":"company","lineage":["https://openalex.org/I10654025","https://openalex.org/I134353371"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lihan Hu","raw_affiliation_strings":["SK Hynix America, San Jose, CA, USA"],"raw_orcid":"https://orcid.org/0009-0008-7152-7842","affiliations":[{"raw_affiliation_string":"SK Hynix America, San Jose, CA, USA","institution_ids":["https://openalex.org/I10654025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042236070","display_name":"Shiju Li","orcid":"https://orcid.org/0000-0003-3145-6329"},"institutions":[{"id":"https://openalex.org/I10654025","display_name":"SK Group (United States)","ror":"https://ror.org/00qajw440","country_code":"US","type":"company","lineage":["https://openalex.org/I10654025","https://openalex.org/I134353371"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shiju Li","raw_affiliation_strings":["SK Hynix America, San Jose, CA, USA"],"raw_orcid":"https://orcid.org/0009-0009-9755-9591","affiliations":[{"raw_affiliation_string":"SK Hynix America, San Jose, CA, USA","institution_ids":["https://openalex.org/I10654025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008944433","display_name":"Hoshik Kim","orcid":"https://orcid.org/0000-0002-4017-8124"},"institutions":[{"id":"https://openalex.org/I10654025","display_name":"SK Group (United States)","ror":"https://ror.org/00qajw440","country_code":"US","type":"company","lineage":["https://openalex.org/I10654025","https://openalex.org/I134353371"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hoshik Kim","raw_affiliation_strings":["SK Hynix America, San Jose, CA, USA"],"raw_orcid":"https://orcid.org/0000-0002-4017-8124","affiliations":[{"raw_affiliation_string":"SK Hynix America, San Jose, CA, USA","institution_ids":["https://openalex.org/I10654025"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018146250","display_name":"Jongryool Kim","orcid":"https://orcid.org/0009-0006-5938-6878"},"institutions":[{"id":"https://openalex.org/I10654025","display_name":"SK Group (United States)","ror":"https://ror.org/00qajw440","country_code":"US","type":"company","lineage":["https://openalex.org/I10654025","https://openalex.org/I134353371"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jongryool Kim","raw_affiliation_strings":["SK Hynix America, San Jose, CA, USA"],"raw_orcid":"https://orcid.org/0009-0006-5938-6878","affiliations":[{"raw_affiliation_string":"SK Hynix America, San Jose, CA, USA","institution_ids":["https://openalex.org/I10654025"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.70784281,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"25","issue":"1","first_page":"186","last_page":"189"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3880000114440918,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3880000114440918,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.2957000136375427,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.07890000194311142,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6796000003814697},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.6137999892234802},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.6134999990463257},{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.5878000259399414},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.5776000022888184},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.5702000260353088},{"id":"https://openalex.org/keywords/in-memory-processing","display_name":"In-Memory Processing","score":0.5055999755859375},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.48899999260902405},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.4489000141620636}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.786300003528595},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6796000003814697},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6427000164985657},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.6137999892234802},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.6134999990463257},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.5878000259399414},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.5776000022888184},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.5702000260353088},{"id":"https://openalex.org/C123593499","wikidata":"https://www.wikidata.org/wiki/Q6008583","display_name":"In-Memory Processing","level":5,"score":0.5055999755859375},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.48899999260902405},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.4489000141620636},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.43389999866485596},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.40639999508857727},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3959999978542328},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.3944999873638153},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.391400009393692},{"id":"https://openalex.org/C79470037","wikidata":"https://www.wikidata.org/wiki/Q279748","display_name":"Out-of-core algorithm","level":2,"score":0.3774000108242035},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.32249999046325684},{"id":"https://openalex.org/C106516650","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm design","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.27090001106262207},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.2685000002384186},{"id":"https://openalex.org/C168781493","wikidata":"https://www.wikidata.org/wiki/Q80585","display_name":"Associative array","level":2,"score":0.2685000002384186},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.25699999928474426},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C124007464","wikidata":"https://www.wikidata.org/wiki/Q428091","display_name":"Tensor contraction","level":3,"score":0.2533000111579895},{"id":"https://openalex.org/C124851039","wikidata":"https://www.wikidata.org/wiki/Q2665459","display_name":"Compressed sensing","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lca.2026.3690418","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2026.3690418","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Fine-grained":[0],"structured":[1,65,127],"sparsity,":[2],"especially":[3],"the":[4,13,40,98],"2:4":[5],"pattern,":[6],"is":[7],"a":[8,60,123],"practical":[9],"way":[10],"to":[11,75],"reduce":[12],"computation":[14],"and":[15,46,54,84,107,118],"memory":[16,48,108,120],"cost":[17],"of":[18,126],"deep":[19],"learning":[20],"workloads":[21],"while":[22],"preserving":[23],"model":[24],"accuracy.":[25],"NVIDIA":[26],"Sparse":[27],"Tensor":[28,68,104],"Cores":[29],"support":[30],"such":[31],"sparsity":[32],"through":[33],"metadata-driven":[34,72],"sparse":[35,42,128],"matrix-matrix":[36],"multiplication":[37],"(SpMM).":[38],"However,":[39],"compressed":[41],"format":[43],"introduces":[44],"indirect":[45,95],"irregular":[47],"accesses,":[49],"which":[50],"disrupt":[51],"data":[52,73],"locality":[53],"limit":[55],"performance.":[56],"We":[57],"propose":[58],"TC-AIA,":[59],"processing-near-HBM":[61],"framework":[62],"for":[63],"accelerating":[64],"SpMM":[66],"on":[67],"Cores.":[69],"TC-AIA":[70,102,114],"offloads":[71],"gathering":[74],"logic":[76],"near":[77],"HBM,":[78],"where":[79],"scattered":[80],"accesses":[81,96],"are":[82],"reorganized":[83],"packed":[85],"into":[86],"dense":[87],"Tensor-Core-compatible":[88],"inputs":[89],"before":[90],"computation.":[91],"By":[92],"removing":[93],"most":[94],"from":[97],"GPU-side":[99],"execution":[100],"path,":[101],"improves":[103],"Core":[105],"utilization":[106],"efficiency.":[109],"Experimental":[110],"results":[111],"show":[112],"that":[113],"achieves":[115],"significant":[116],"speedup":[117],"reduces":[119],"traffic":[121],"across":[122],"wide":[124],"range":[125],"workloads.":[129]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
