{"id":"https://openalex.org/W7119486523","doi":"https://doi.org/10.1145/3773656.3773663","title":"Towards Unified Acceleration: Weight-Stationary GEMM on HPC-oriented Elastic CGRAs","display_name":"Towards Unified Acceleration: Weight-Stationary GEMM on HPC-oriented Elastic CGRAs","publication_year":2026,"publication_date":"2026-01-09","ids":{"openalex":"https://openalex.org/W7119486523","doi":"https://doi.org/10.1145/3773656.3773663"},"language":null,"primary_location":{"id":"doi:10.1145/3773656.3773663","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773663","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3773656.3773663","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082052909","display_name":"Chenlin Shi","orcid":"https://orcid.org/0000-0002-3016-1628"},"institutions":[{"id":"https://openalex.org/I20529979","display_name":"University of Electro-Communications","ror":"https://ror.org/02x73b849","country_code":"JP","type":"education","lineage":["https://openalex.org/I20529979"]},{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Chenlin Shi","raw_affiliation_strings":["RIKEN Center for Computational Science, Kobe, Hyogo, Japan and The University of Electro-Communications, Chofu, Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0002-3016-1628","affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan and The University of Electro-Communications, Chofu, Tokyo, Japan","institution_ids":["https://openalex.org/I20529979","https://openalex.org/I4210129730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018217309","display_name":"Boma Anantasatya Adhi","orcid":"https://orcid.org/0000-0002-8165-9792"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Boma Anantasatya Adhi","raw_affiliation_strings":["RIKEN Center for Computational Science, Kobe, Hyogo, Japan"],"raw_orcid":"https://orcid.org/0000-0002-8165-9792","affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122340569","display_name":"Lin Teng","orcid":null},"institutions":[{"id":"https://openalex.org/I20529979","display_name":"University of Electro-Communications","ror":"https://ror.org/02x73b849","country_code":"JP","type":"education","lineage":["https://openalex.org/I20529979"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Lin Teng","raw_affiliation_strings":["The University of Electro-Communications, Chofu, Tokyo, Japan"],"raw_orcid":"https://orcid.org/0009-0006-9854-0081","affiliations":[{"raw_affiliation_string":"The University of Electro-Communications, Chofu, Tokyo, Japan","institution_ids":["https://openalex.org/I20529979"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiaheng Liu","orcid":"https://orcid.org/0009-0003-1543-5277"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Jiaheng Liu","raw_affiliation_strings":["RIKEN Center for Computational Science, Kobe, Hyogo, Japan"],"raw_orcid":"https://orcid.org/0009-0003-1543-5277","affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047490989","display_name":"Shinobu Miwa","orcid":"https://orcid.org/0000-0003-0315-3216"},"institutions":[{"id":"https://openalex.org/I20529979","display_name":"University of Electro-Communications","ror":"https://ror.org/02x73b849","country_code":"JP","type":"education","lineage":["https://openalex.org/I20529979"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinobu Miwa","raw_affiliation_strings":["The University of Electro-Communications, Chofu, Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0003-0315-3216","affiliations":[{"raw_affiliation_string":"The University of Electro-Communications, Chofu, Tokyo, Japan","institution_ids":["https://openalex.org/I20529979"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025480991","display_name":"Kentaro Sano","orcid":null},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kentaro Sano","raw_affiliation_strings":["RIKEN Center for Computational Science, Kobe, Hyogo, Japan"],"raw_orcid":"https://orcid.org/0000-0002-6681-4192","affiliations":[{"raw_affiliation_string":"RIKEN Center for Computational Science, Kobe, Hyogo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5082052909"],"corresponding_institution_ids":["https://openalex.org/I20529979","https://openalex.org/I4210129730"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.03314229,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"102","last_page":"111"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4422999918460846,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4422999918460846,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.32910001277923584,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.03660000115633011,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5824999809265137},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.5669999718666077},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.414000004529953},{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.3968000113964081},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.357699990272522},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.35120001435279846},{"id":"https://openalex.org/keywords/parallel-processing","display_name":"Parallel processing","score":0.3488999903202057},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.33169999718666077}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7540000081062317},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5824999809265137},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.5669999718666077},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5504000186920166},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.542900025844574},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.414000004529953},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.3968000113964081},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.357699990272522},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.35120001435279846},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.3488999903202057},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.34540000557899475},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.33169999718666077},{"id":"https://openalex.org/C158622935","wikidata":"https://www.wikidata.org/wiki/Q660848","display_name":"Nonlinear system","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C2982832238","wikidata":"https://www.wikidata.org/wiki/Q5531640","display_name":"General purpose","level":2,"score":0.3188000023365021},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.3181000053882599},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.31439998745918274},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.311599999666214},{"id":"https://openalex.org/C96147967","wikidata":"https://www.wikidata.org/wiki/Q190686","display_name":"Subroutine","level":2,"score":0.3107999861240387},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3001999855041504},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C55526617","wikidata":"https://www.wikidata.org/wiki/Q719375","display_name":"Operand","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2596000134944916},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3773656.3773663","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773663","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3773656.3773663","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773663","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.42908164858818054,"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W2017369466","https://openalex.org/W2026659976","https://openalex.org/W2077791698","https://openalex.org/W2091596155","https://openalex.org/W2107076945","https://openalex.org/W2346205343","https://openalex.org/W2604319603","https://openalex.org/W2888872259","https://openalex.org/W2889517658","https://openalex.org/W2912425543","https://openalex.org/W2953212265","https://openalex.org/W2980612421","https://openalex.org/W3015998244","https://openalex.org/W3097528158","https://openalex.org/W3159763498","https://openalex.org/W3171209294","https://openalex.org/W3213528054","https://openalex.org/W4289827620","https://openalex.org/W4289827859","https://openalex.org/W4311839927","https://openalex.org/W4312942943","https://openalex.org/W4312968147","https://openalex.org/W4321637475","https://openalex.org/W4385574952","https://openalex.org/W4386763463","https://openalex.org/W4387250977","https://openalex.org/W4388838056","https://openalex.org/W4400411245","https://openalex.org/W4401017906","https://openalex.org/W4401211807","https://openalex.org/W4402870437","https://openalex.org/W4408894276","https://openalex.org/W4410400756"],"related_works":[],"abstract_inverted_index":{"Elastic":[0],"coarse-grained":[1],"reconfigurable":[2],"arrays":[3],"(CGRAs)":[4],"are":[5,67],"emerging":[6],"accelerators":[7,40],"that":[8,107,121],"combine":[9],"performance":[10,128],"efficiency":[11],"with":[12,103],"computational":[13,89,112],"flexibility,":[14],"making":[15],"them":[16],"well":[17],"suited":[18],"for":[19,91,132,149],"high-performance":[20],"computing":[21],"(HPC)":[22],"applications.":[23],"Recent":[24],"studies":[25],"have":[26],"also":[27],"explored":[28],"their":[29],"role":[30],"in":[31],"AI":[32,39,111,133,153],"acceleration:":[33],"some":[34],"works":[35],"integrate":[36],"CGRAs":[37],"into":[38],"to":[41,54],"handle":[42],"nonlinear":[43],"functions":[44],"such":[45],"as":[46],"softmax":[47],"and":[48,152],"layer":[49],"normalization,":[50],"while":[51,135],"others":[52],"attempt":[53],"map":[55],"general":[56,92],"matrix":[57],"multiplication":[58],"(GEMM)":[59],"kernels":[60],"directly":[61],"onto":[62],"CGRAs.":[63,116,161],"However,":[64],"these":[65],"approaches":[66],"fundamentally":[68],"limited,":[69],"focusing":[70],"either":[71],"on":[72,80],"nonlinear/HPC":[73],"tasks":[74,134],"without":[75],"efficient":[76],"GEMM":[77,81],"support,":[78],"or":[79],"acceleration":[82,148],"at":[83],"the":[84,88,110,144,157],"cost":[85],"of":[86,114,146,159],"sacrificing":[87],"capabilities":[90,113],"purpose":[93],"computing.":[94],"In":[95],"this":[96],"paper,":[97],"we":[98],"present":[99],"a":[100],"double-buffer":[101],"architecture":[102],"dedicated":[104],"weight-uploading":[105],"chains":[106],"significantly":[108],"enhance":[109],"elastic":[115,160],"Our":[117],"experimental":[118],"results":[119],"show":[120],"our":[122],"approach":[123],"achieves":[124],"3":[125],"\u00d7":[126],"higher":[127],"than":[129],"existing":[130],"methods":[131],"maintaining":[136],"HPC":[137,151],"application":[138],"processing":[139],"capabilities.":[140],"This":[141],"work":[142],"demonstrates":[143],"feasibility":[145],"unified":[147],"both":[150],"workloads,":[154],"thereby":[155],"expanding":[156],"applicability":[158]},"counts_by_year":[],"updated_date":"2026-01-10T23:39:48.068659","created_date":"2026-01-09T00:00:00"}
