{"id":"https://openalex.org/W4414198091","doi":"https://doi.org/10.1109/dac63849.2025.11132619","title":"VSpGEMM: Exploiting Versal ACAP for High-Performance SpGEMM Acceleration","display_name":"VSpGEMM: Exploiting Versal ACAP for High-Performance SpGEMM Acceleration","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414198091","doi":"https://doi.org/10.1109/dac63849.2025.11132619"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11132619","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101062141","display_name":"Kai Shi","orcid":"https://orcid.org/0009-0002-4059-9663"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Kai Shi","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101564905","display_name":"Zhe Lin","orcid":"https://orcid.org/0009-0002-1594-2335"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhe Lin","raw_affiliation_strings":["Sun Yat-sen University"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054929311","display_name":"Xinya Luan","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinya Luan","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016813457","display_name":"Jianwang Zhai","orcid":"https://orcid.org/0000-0002-1581-3536"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwang Zhai","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5027087200","display_name":"Kang Zhao","orcid":"https://orcid.org/0000-0003-0502-8523"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kang Zhao","raw_affiliation_strings":["Beijing University of Posts and Telecommunications"],"affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101062141"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":1.4196,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.86264741,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9854999780654907,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9854999780654907,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10346","display_name":"Magnetic confinement fusion research","score":0.9797999858856201,"subfield":{"id":"https://openalex.org/subfields/3106","display_name":"Nuclear and High Energy Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9732999801635742,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5694000124931335},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5572999715805054},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.47839999198913574},{"id":"https://openalex.org/keywords/matrix-representation","display_name":"Matrix representation","score":0.4778999984264374},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.47429999709129333},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.4715000092983246},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.45680001378059387},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4415000081062317},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.4404999911785126}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8098999857902527},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5694000124931335},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5572999715805054},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5134000182151794},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.47839999198913574},{"id":"https://openalex.org/C103275481","wikidata":"https://www.wikidata.org/wiki/Q6787889","display_name":"Matrix representation","level":3,"score":0.4778999984264374},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.47429999709129333},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.4715000092983246},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.45680001378059387},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4415000081062317},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.4404999911785126},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.42239999771118164},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.39959999918937683},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.39320001006126404},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.39239999651908875},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3905999958515167},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.373199999332428},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3504999876022339},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3467000126838684},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C47487241","wikidata":"https://www.wikidata.org/wiki/Q5227230","display_name":"Data access","level":2,"score":0.3239000141620636},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.3127000033855438},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.30469998717308044},{"id":"https://openalex.org/C90673727","wikidata":"https://www.wikidata.org/wiki/Q901718","display_name":"Product (mathematics)","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2937000095844269},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C194739806","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Computer data storage","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.2720000147819519},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2687999904155731},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2558000087738037},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11132619","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W784772322","https://openalex.org/W1980282429","https://openalex.org/W2035080386","https://openalex.org/W2089437293","https://openalex.org/W2612148068","https://openalex.org/W2751658790","https://openalex.org/W2914086833","https://openalex.org/W3000305333","https://openalex.org/W3041191963","https://openalex.org/W3175878603","https://openalex.org/W3198975860","https://openalex.org/W4220912491","https://openalex.org/W4293084046","https://openalex.org/W4386709563","https://openalex.org/W4391468153","https://openalex.org/W4401331008","https://openalex.org/W4401753418"],"related_works":[],"abstract_inverted_index":{"Sparse":[0],"general":[1,19],"matrix-matrix":[2],"multiplication":[3],"(SpGEMM)":[4],"serves":[5],"as":[6,14],"a":[7,31,68,82,94,120,145,152,186],"fundamental":[8],"operation":[9],"in":[10,23,53,90,144,203],"real-world":[11],"applications":[12],"such":[13],"deep":[15],"learning.":[16],"Different":[17],"from":[18],"matrix":[20],"multiplication,":[21],"matrices":[22],"SpGEMM":[24,72,130],"are":[25],"highly":[26],"sparse":[27],"and":[28,42,44,58,76,96,105,115,163,180,197],"therefore":[29],"require":[30],"compact":[32],"representation.":[33],"This":[34],"places":[35],"an":[36,198],"additional":[37],"burden":[38],"on":[39,78,142,195,209],"data":[40,117,149],"preprocessing":[41],"exchanging":[43],"also":[45],"causes":[46],"irregular":[47],"memory":[48,113,178],"access":[49],"patterns,":[50],"which":[51,92,171],"can":[52],"turn":[54],"lead":[55],"to":[56,100,126,158,207],"communication":[57,181],"computation":[59,133],"bottlenecks.":[60],"To":[61],"break":[62],"these":[63],"bottlenecks,":[64],"we":[65],"present":[66],"VSpGEMM,":[67,91],"hardware":[69],"accelerator":[70],"for":[71,167],"that":[73,135],"is":[74,88,124,156],"tailored":[75],"optimized":[77],"Versal":[79,143,196],"ACAP.":[80],"Firstly,":[81],"new":[83],"storage":[84],"format":[85],"called":[86],"BCSX":[87],"proposed":[89],"offers":[93],"unified":[95],"block-wise":[97],"compression":[98],"strategy":[99],"deal":[101],"with":[102],"both":[103,160],"row-major":[104],"columnmajor":[106],"representation":[107],"of":[108,176,216],"non-zero":[109],"data,":[110],"enabling":[111],"fixed-pattern":[112],"accesses":[114],"effective":[116],"preloading.":[118],"Secondly,":[119],"multi-level":[121],"tiling":[122],"mechanism":[123],"introduced":[125],"decompose":[127],"the":[128,138,161,174,214],"holistic":[129],"into":[131,137],"multiple":[132],"granularities":[134],"fit":[136],"AI":[139],"Engines":[140],"(AIEs)":[141],"hierarchical":[146],"manner,":[147],"enhancing":[148],"reuse.":[150],"Thirdly,":[151],"hybrid":[153],"partitioning":[154],"scheme":[155],"presented":[157],"orchestrate":[159],"AIEs":[162],"programmable":[164],"logic":[165],"(PL)":[166],"intermediate":[168],"product":[169],"merging,":[170],"together":[172],"resolve":[173],"issues":[175],"high":[177],"utilization":[179],"demand.":[182],"Experimental":[183],"results":[184],"demonstrate":[185],"$2.65":[187],"\\times$":[188,201],"speedup":[189],"over":[190],"state-of-the-art":[191],"(SOTA)":[192],"GEMM":[193],"design":[194],"average":[199],"$33.62":[200],"improvement":[202],"energy":[204],"efficiency":[205],"compared":[206],"cuSPARSE":[208],"RTX":[210],"4090":[211],"GPU,":[212],"showing":[213],"efficacy":[215],"VSpGEMM.":[217]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
