{"id":"https://openalex.org/W4400230394","doi":"https://doi.org/10.1109/iscas58744.2024.10558334","title":"High-Utilization GPGPU Design for Accelerating GEMM Workloads: An Incremental Approach","display_name":"High-Utilization GPGPU Design for Accelerating GEMM Workloads: An Incremental Approach","publication_year":2024,"publication_date":"2024-05-19","ids":{"openalex":"https://openalex.org/W4400230394","doi":"https://doi.org/10.1109/iscas58744.2024.10558334"},"language":"en","primary_location":{"id":"doi:10.1109/iscas58744.2024.10558334","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscas58744.2024.10558334","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Symposium on Circuits and Systems (ISCAS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040108243","display_name":"Chongxi Wang","orcid":"https://orcid.org/0000-0003-1295-279X"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chongxi Wang","raw_affiliation_strings":["Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049749857","display_name":"Penghao Song","orcid":"https://orcid.org/0000-0001-7423-1416"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Penghao Song","raw_affiliation_strings":["Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101738617","display_name":"Haoyu Zhao","orcid":"https://orcid.org/0009-0008-8626-1274"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoyu Zhao","raw_affiliation_strings":["Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101657448","display_name":"Fuxin Zhang","orcid":"https://orcid.org/0000-0003-0430-3669"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fuxin Zhang","raw_affiliation_strings":["Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100370333","display_name":"Jian Wang","orcid":"https://orcid.org/0000-0001-5416-0649"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Wang","raw_affiliation_strings":["Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018500461","display_name":"Longbing Zhang","orcid":"https://orcid.org/0009-0004-8374-890X"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbing Zhang","raw_affiliation_strings":["Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,State Key Lab of Processors, Institute of Computing Technology,Beijing,China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5040108243"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210090176"],"apc_list":null,"apc_paid":null,"fwci":1.4628,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.81497633,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11338","display_name":"Advancements in Photolithography Techniques","score":0.9866999983787537,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8117671012878418},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.7480108737945557},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6386827230453491},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2059471607208252},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.13202333450317383}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8117671012878418},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.7480108737945557},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6386827230453491},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2059471607208252},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.13202333450317383}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iscas58744.2024.10558334","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscas58744.2024.10558334","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Symposium on Circuits and Systems (ISCAS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321133","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2033486618","https://openalex.org/W2152839228","https://openalex.org/W2194775991","https://openalex.org/W2606722458","https://openalex.org/W2796649226","https://openalex.org/W2906043559","https://openalex.org/W2913790721","https://openalex.org/W2920798074","https://openalex.org/W2963989532","https://openalex.org/W3043303806","https://openalex.org/W3127736057","https://openalex.org/W3157657667","https://openalex.org/W3190062760","https://openalex.org/W4206745970","https://openalex.org/W4308090436","https://openalex.org/W4309672181","https://openalex.org/W4327694885","https://openalex.org/W4385245566","https://openalex.org/W6750448596","https://openalex.org/W6760272376"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2505380084","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2086739451","https://openalex.org/W1980160788"],"abstract_inverted_index":{"General":[0,90],"Purpose":[1],"Graphics":[2],"Processing":[3],"Units":[4],"(GPGPUs)":[5],"have":[6,63],"been":[7],"employed":[8],"primarily":[9],"in":[10,19,37,67,177,182,202],"domains":[11],"such":[12],"as":[13],"graphics":[14],"acceleration":[15],"and":[16,59,158,214],"high-performance":[17],"computing":[18],"the":[20,23,30,45,51,100,106,112,115,119,123,126,138,187,199,206],"past.":[21],"However,":[22],"rise":[24],"of":[25,48,53,102,114,125,134,140,155,162,194,210,217],"artificial":[26],"intelligence":[27],"(AI),":[28],"particularly":[29],"computational":[31,46],"demands":[32,43],"associated":[33],"with":[34,165],"matrix":[35,54,103,127,218],"multiplications":[36],"AI":[38,69],"models,":[39],"has":[40],"presented":[41],"formidable":[42],"on":[44,143],"power":[47],"GPGPUs.":[49],"Consequently,":[50],"design":[52,76,101,175],"multiplication":[55],"units":[56,104],"within":[57,105,118],"GPGPUs":[58],"ensuring":[60],"their":[61],"utilization":[62,124],"become":[64],"key":[65],"issues":[66],"optimizing":[68],"workloads.":[70],"This":[71,95],"paper":[72],"explores":[73],"an":[74],"incremental":[75],"approach,":[77],"building":[78],"upon":[79],"a":[80,131,141,144,151,179,192],"Single":[81],"Instruction":[82],"Multiple":[83],"Threads":[84],"(SIMT)":[85],"GPGPU":[86,120,142,189,200],"architecture,":[87],"to":[88,121,169,186],"facilitate":[89],"Matrix":[91],"Multiply":[92],"(GEMM)":[93],"acceleration.":[94],"approach":[96,136],"encompasses":[97],"not":[98],"only":[99,178],"stream":[107],"processors":[108],"but,":[109],"more":[110],"crucially,":[111],"optimization":[113],"data":[116],"path":[117],"maximize":[122],"units.":[128],"We":[129],"present":[130],"practical":[132],"demonstration":[133],"our":[135],"through":[137],"fabrication":[139],"12":[145],"nm":[146],"CMOS":[147],"process":[148],"node,":[149],"achieving":[150],"core":[152],"clock":[153],"speed":[154],"1":[156],"GHz":[157],"INT8":[159],"peak":[160],"performance":[161],"8":[163],"TOPS":[164],"memory":[166],"bandwidth":[167],"limited":[168],"32":[170],"GB/s":[171],"LPDDR4-4000.":[172],"Notably,":[173],"this":[174,203],"results":[176],"6.57%":[180],"increase":[181],"chip":[183],"area":[184],"compared":[185],"original":[188],"design.":[190],"In":[191],"series":[193],"fair":[195],"GEMM":[196],"workload":[197],"tests,":[198],"implemented":[201],"work":[204],"outperforms":[205],"recent":[207],"three":[208],"generations":[209],"NVIDIA":[211],"GPGPUs\u2014V100,":[212],"T4,":[213],"A100\u2014in":[215],"terms":[216],"unit":[219],"utilization.":[220]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2025-10-10T00:00:00"}
