{"id":"https://openalex.org/W4385623280","doi":"https://doi.org/10.1145/3588195.3595936","title":"Accelerating Sparse General Matrix-Matrix Multiplication for NVIDIA Volta GPU and Hygon DCU","display_name":"Accelerating Sparse General Matrix-Matrix Multiplication for NVIDIA Volta GPU and Hygon DCU","publication_year":2023,"publication_date":"2023-08-07","ids":{"openalex":"https://openalex.org/W4385623280","doi":"https://doi.org/10.1145/3588195.3595936"},"language":"en","primary_location":{"id":"doi:10.1145/3588195.3595936","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3588195.3595936","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101651218","display_name":"Z. F. Tian","orcid":"https://orcid.org/0000-0001-8927-4099"},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhuo Tian","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101826024","display_name":"Shuai Yang","orcid":"https://orcid.org/0009-0004-5444-7104"},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Yang","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103068794","display_name":"Changyou Zhang","orcid":"https://orcid.org/0000-0003-4025-0736"},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Changyou Zhang","raw_affiliation_strings":["Institute of Software Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101651218"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210128818"],"apc_list":null,"apc_paid":null,"fwci":0.6181,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.62154523,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"329","last_page":"330"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8530914783477783},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7136540412902832},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.6458276510238647},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.6290591955184937},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.6127225160598755},{"id":"https://openalex.org/keywords/memory-hierarchy","display_name":"Memory hierarchy","score":0.6019829511642456},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.5249457359313965},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.4949480891227722},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.463352233171463},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.453469842672348},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.41692647337913513},{"id":"https://openalex.org/keywords/row","display_name":"Row","score":0.4164765477180481},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.26472389698028564},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.17075565457344055},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.14092659950256348},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.0946049690246582},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.09427580237388611}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8530914783477783},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7136540412902832},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.6458276510238647},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.6290591955184937},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6127225160598755},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.6019829511642456},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.5249457359313965},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.4949480891227722},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.463352233171463},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.453469842672348},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.41692647337913513},{"id":"https://openalex.org/C135598885","wikidata":"https://www.wikidata.org/wiki/Q1366302","display_name":"Row","level":2,"score":0.4164765477180481},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.26472389698028564},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.17075565457344055},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.14092659950256348},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0946049690246582},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.09427580237388611},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.0},{"id":"https://openalex.org/C98986596","wikidata":"https://www.wikidata.org/wiki/Q1143031","display_name":"Semiconductor memory","level":2,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3588195.3595936","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3588195.3595936","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":3,"referenced_works":["https://openalex.org/W2101320423","https://openalex.org/W2985041356","https://openalex.org/W4200089575"],"related_works":["https://openalex.org/W2293771254","https://openalex.org/W2128546436","https://openalex.org/W3121828480","https://openalex.org/W2039875226","https://openalex.org/W4221142455","https://openalex.org/W2914631005","https://openalex.org/W2123154672","https://openalex.org/W2032786851","https://openalex.org/W2972717823","https://openalex.org/W2952630098"],"abstract_inverted_index":{"Sparse":[0],"general":[1,76],"matrix-matrix":[2],"multiplication":[3],"(SpGEMM)":[4],"is":[5,74],"challenging":[6],"especially":[7],"on":[8,46],"graphic":[9],"accelerators.":[10],"Existing":[11],"solutions":[12],"do":[13],"not":[14],"fully":[15,75],"utilize":[16,28],"the":[17,21,29,38,43,83,93,98],"shared":[18,33],"memory":[19,34,99],"of":[20,60],"graphics":[22,30],"accelerator.":[23],"Our":[24,72],"proposal":[25,57],"could":[26],"effectively":[27],"accelerator's":[31],"on-chip":[32],"and":[35,77,96],"dynamically":[36],"assign":[37],"device":[39],"resources":[40],"by":[41,91],"grouping":[42],"rows":[44],"based":[45],"a":[47],"hybrid":[48],"strategy":[49,80],"for":[50],"load":[51],"balancing.":[52],"Experiments":[53],"show":[54],"that":[55],"our":[56,78],"achieves":[58],"speedups":[59],"up":[61],"to":[62,68,87],"x7.43":[63],"in":[64],"double":[65],"precision":[66],"compared":[67],"existing":[69],"SpGEMM":[70,84],"libraries.":[71],"implementation":[73],"optimization":[79],"adaptively":[81],"processes":[82],"workload":[85],"row-wise":[86],"substantially":[88],"improve":[89],"performance":[90],"decreasing":[92],"work":[94],"complexity":[95],"utilizing":[97],"hierarchy":[100],"more":[101],"effectively.":[102]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
