{"id":"https://openalex.org/W7133509720","doi":"https://doi.org/10.1109/hpca68181.2026.11408479","title":"Swift: High-Performance Sparse-Dense Matrix Multiplication on GPUs","display_name":"Swift: High-Performance Sparse-Dense Matrix Multiplication on GPUs","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7133509720","doi":"https://doi.org/10.1109/hpca68181.2026.11408479"},"language":null,"primary_location":{"id":"doi:10.1109/hpca68181.2026.11408479","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408479","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070174646","display_name":"Jinyu Hu","orcid":"https://orcid.org/0009-0007-8422-2901"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jinyu Hu","raw_affiliation_strings":["Hunan University"],"affiliations":[{"raw_affiliation_string":"Hunan University","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082012083","display_name":"Huizhang Luo","orcid":"https://orcid.org/0000-0003-2392-0267"},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huizhang Luo","raw_affiliation_strings":["Hunan University"],"affiliations":[{"raw_affiliation_string":"Hunan University","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074141177","display_name":"Hong Min Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I189196454","display_name":"The University of Texas at Arlington","ror":"https://ror.org/019kgqr73","country_code":"US","type":"education","lineage":["https://openalex.org/I189196454"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hong Jiang","raw_affiliation_strings":["UT Arlington"],"affiliations":[{"raw_affiliation_string":"UT Arlington","institution_ids":["https://openalex.org/I189196454"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120898093","display_name":"Marc Casas","orcid":null},"institutions":[{"id":"https://openalex.org/I2799803557","display_name":"Barcelona Supercomputing Center","ror":"https://ror.org/05sd8tv96","country_code":"ES","type":"facility","lineage":["https://openalex.org/I2799803557","https://openalex.org/I9617848"]},{"id":"https://openalex.org/I9617848","display_name":"Universitat Polit\u00e8cnica de Catalunya","ror":"https://ror.org/03mb6wj31","country_code":"ES","type":"education","lineage":["https://openalex.org/I9617848"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Marc Casas","raw_affiliation_strings":["Barcelona Supercomputing Center"],"affiliations":[{"raw_affiliation_string":"Barcelona Supercomputing Center","institution_ids":["https://openalex.org/I9617848","https://openalex.org/I2799803557"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128058774","display_name":"Kenli Li","orcid":null},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kenli Li","raw_affiliation_strings":["Hunan University"],"affiliations":[{"raw_affiliation_string":"Hunan University","institution_ids":["https://openalex.org/I16609230"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5126604874","display_name":"Chubo Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I16609230","display_name":"Hunan University","ror":"https://ror.org/05htk5m33","country_code":"CN","type":"education","lineage":["https://openalex.org/I16609230"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chubo Liu","raw_affiliation_strings":["Hunan University"],"affiliations":[{"raw_affiliation_string":"Hunan University","institution_ids":["https://openalex.org/I16609230"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5070174646"],"corresponding_institution_ids":["https://openalex.org/I16609230"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.91239455,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"16"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.64410001039505,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.64410001039505,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.10540000349283218,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.03150000050663948,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.3587999939918518},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.33730000257492065},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.2897000014781952},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.2809000015258789}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5364000201225281},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3587999939918518},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.35850000381469727},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.33730000257492065},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2937000095844269},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.2696000039577484},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.23999999463558197},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.23749999701976776}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca68181.2026.11408479","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408479","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1935978687","https://openalex.org/W1989562524","https://openalex.org/W1996615911","https://openalex.org/W2034050544","https://openalex.org/W2035080386","https://openalex.org/W2040404421","https://openalex.org/W2070232376","https://openalex.org/W2088866486","https://openalex.org/W2111667319","https://openalex.org/W2134237243","https://openalex.org/W2785844383","https://openalex.org/W2808128431","https://openalex.org/W2911544077","https://openalex.org/W2913983209","https://openalex.org/W2914631005","https://openalex.org/W2964015378","https://openalex.org/W3130660608","https://openalex.org/W3132695675","https://openalex.org/W3208099998","https://openalex.org/W4246740707","https://openalex.org/W4285104934","https://openalex.org/W4293024985","https://openalex.org/W4308620576","https://openalex.org/W4321636621","https://openalex.org/W4378804724","https://openalex.org/W4378804782","https://openalex.org/W4380875466","https://openalex.org/W4386709686","https://openalex.org/W4390097706","https://openalex.org/W4391925833","https://openalex.org/W4391986945","https://openalex.org/W4393146752","https://openalex.org/W4393407064","https://openalex.org/W4394998401","https://openalex.org/W4399424818","https://openalex.org/W4401895034","https://openalex.org/W4401945586"],"related_works":[],"abstract_inverted_index":{"Sparse-Dense":[0],"Matrix":[1,237],"Multiplication":[2],"(SpMM)":[3],"on":[4,33,48,117,150,178,245],"GPUs":[5,23],"has":[6,54],"gained":[7],"significant":[8],"attention":[9,53],"because":[10],"of":[11,22,36,46,60,68,81,85,105,142,148,167,173,181,222],"its":[12],"importance":[13,35],"in":[14,24,64],"modern":[15,151],"applications":[16],"and":[17,39,108,115,145,171,184,192,196,214,231,251],"the":[18,25,34,43,58,66,86,112,140,165,168,174,179,186,190,200,212,234,240,258],"increasing":[19],"computing":[20],"power":[21],"last":[26],"decade.":[27],"Previous":[28],"SpMM":[29,47,149],"studies":[30],"have":[31],"focused":[32],"storage":[37],"format":[38,204],"load":[40,187],"balance":[41],"for":[42],"overall":[44,87],"performance":[45],"GPUs.":[49,118,152],"However,":[50],"very":[51],"little":[52],"been":[55],"paid":[56],"to":[57,78,90,99,210],"efficacy":[59],"coalesced":[61,102,154],"memory":[62,103,114,155],"access":[63,104],"improving":[65],"efficiency":[67,161],"data":[69],"loading,":[70],"which":[71],"incurs":[72],"a":[73,219],"notable":[74],"overhead":[75],"that":[76,137,254],"amounts":[77],"an":[79,124,206],"average":[80],"more":[82],"than":[83],"32%":[84],"performance,":[88],"according":[89],"our":[91,255],"experimental":[92,243],"observation.":[93],"Existing":[94],"state-of-the-art":[95],"(SOTA)":[96],"solutions":[97],"fail":[98],"adequately":[100],"support":[101],"both":[106,143,164],"sparse":[107,144,169],"dense":[109,146,175],"matrices":[110,147],"between":[111],"global":[113],"threads":[116],"In":[119],"this":[120],"paper,":[121],"we":[122],"propose":[123],"efficient":[125],"algorithm":[126],"called":[127],"Swift.<sup":[128],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[129,131],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup><sup":[130],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>Swift":[132],"is":[133],"available":[134],"at":[135],"https://github.com/MinttHu/Swift.git":[136],"speeds":[138],"up":[139],"loading":[141,160],"Leveraging":[153],"access,":[156],"Swift":[157,198,223],"achieves":[158],"high":[159],"by":[162,188],"sorting":[163],"columns":[166],"matrix":[170,176],"elements":[172,183],"based":[177],"number":[180],"non-zero":[182],"balancing":[185],"handling":[189],"regular":[191],"irregular":[193],"parts":[194],"differently":[195],"judiciously.":[197],"takes":[199],"Compressed":[201],"Sparse":[202],"Column":[203],"as":[205,239],"implementation":[207],"case":[208],"study":[209],"prove":[211],"concept":[213],"gain":[215],"insights.":[216],"We":[217],"conduct":[218],"comprehensive":[220],"comparison":[221],"with":[224],"four":[225],"SOTA":[226],"solutions:":[227],"ASpT,":[228],"cuSPARSE,":[229],"RoDe,":[230],"Sputnik,":[232],"using":[233],"full":[235],"SuiteSparse":[236],"Collection":[238],"workload.":[241],"The":[242],"results":[244],"RTX":[246,248],"4080s,":[247],"3090Ti,":[249],"A100,":[250],"V100":[252],"demonstrate":[253],"method":[256],"outperforms":[257],"baselines":[259],"significantly.":[260]},"counts_by_year":[],"updated_date":"2026-03-06T06:45:51.903784","created_date":"2026-03-05T00:00:00"}
