{"id":"https://openalex.org/W4416963636","doi":"https://doi.org/10.1109/islped65674.2025.11261759","title":"Towards Zero-Stall Matrix Multiplication on Energy-Efficient RISC-V Clusters for Machine Learning Acceleration","display_name":"Towards Zero-Stall Matrix Multiplication on Energy-Efficient RISC-V Clusters for Machine Learning Acceleration","publication_year":2025,"publication_date":"2025-08-06","ids":{"openalex":"https://openalex.org/W4416963636","doi":"https://doi.org/10.1109/islped65674.2025.11261759"},"language":null,"primary_location":{"id":"doi:10.1109/islped65674.2025.11261759","is_oa":false,"landing_page_url":"https://doi.org/10.1109/islped65674.2025.11261759","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5094583316","display_name":"Luca Colagrande","orcid":"https://orcid.org/0000-0002-7986-1975"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Luca Colagrande","raw_affiliation_strings":["ETH Zurich,Integrated Systems Laboratory (IIS),Zurich,Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Zurich,Integrated Systems Laboratory (IIS),Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120630936","display_name":"Lorenzo Leone","orcid":null},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Lorenzo Leone","raw_affiliation_strings":["ETH Zurich,Integrated Systems Laboratory (IIS),Zurich,Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Zurich,Integrated Systems Laboratory (IIS),Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120540956","display_name":"Maximilian Coco","orcid":null},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Maximilian Coco","raw_affiliation_strings":["ETH Zurich,D-ITET,Zurich,Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Zurich,D-ITET,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120582400","display_name":"Andrei Deaconeasa","orcid":null},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Andrei Deaconeasa","raw_affiliation_strings":["ETH Zurich,D-ITET,Zurich,Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Zurich,D-ITET,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043408422","display_name":"Luca Benini","orcid":"https://orcid.org/0000-0001-8068-3806"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Luca Benini","raw_affiliation_strings":["ETH Zurich,Integrated Systems Laboratory (IIS),Zurich,Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Zurich,Integrated Systems Laboratory (IIS),Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.8294,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.95367266,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8302000164985657,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8302000164985657,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.10589999705553055,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.006899999920278788,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5626999735832214},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.5397999882698059},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.5393000245094299},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.524399995803833},{"id":"https://openalex.org/keywords/microarchitecture","display_name":"Microarchitecture","score":0.47780001163482666},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4578000009059906},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.4327000081539154},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.42739999294281006},{"id":"https://openalex.org/keywords/energy","display_name":"Energy (signal processing)","score":0.42570000886917114}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.801800012588501},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5626999735832214},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5422999858856201},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.5397999882698059},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.5393000245094299},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.524399995803833},{"id":"https://openalex.org/C107598950","wikidata":"https://www.wikidata.org/wiki/Q259864","display_name":"Microarchitecture","level":2,"score":0.47780001163482666},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4578000009059906},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.4327000081539154},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.42739999294281006},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.42570000886917114},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.39980000257492065},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3887999951839447},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.31709998846054077},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.3147999942302704},{"id":"https://openalex.org/C160191386","wikidata":"https://www.wikidata.org/wiki/Q868299","display_name":"Control flow","level":2,"score":0.3100999891757965},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.30000001192092896},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2973000109195709},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2827000021934509},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.27149999141693115},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C60798267","wikidata":"https://www.wikidata.org/wiki/Q1226939","display_name":"Division (mathematics)","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C2780150128","wikidata":"https://www.wikidata.org/wiki/Q21948731","display_name":"Extreme learning machine","level":3,"score":0.2612999975681305},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.25949999690055847}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/islped65674.2025.11261759","is_oa":false,"landing_page_url":"https://doi.org/10.1109/islped65674.2025.11261759","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1531042374","https://openalex.org/W2068918965","https://openalex.org/W2113124853","https://openalex.org/W2113558024","https://openalex.org/W2120636791","https://openalex.org/W2132829148","https://openalex.org/W2135418442","https://openalex.org/W2334795732","https://openalex.org/W2547964522","https://openalex.org/W2606722458","https://openalex.org/W2759971396","https://openalex.org/W2963255460","https://openalex.org/W3013692244","https://openalex.org/W3191906639","https://openalex.org/W4386764021","https://openalex.org/W4402475796","https://openalex.org/W4402475804","https://openalex.org/W4402475823","https://openalex.org/W4402475844","https://openalex.org/W4402475848","https://openalex.org/W4406265948","https://openalex.org/W4406983056","https://openalex.org/W4408151419","https://openalex.org/W4409132176"],"related_works":[],"abstract_inverted_index":{"The":[0],"growing":[1],"computational":[2],"demands":[3],"of":[4,13,37,192],"machine":[5],"learning":[6],"(ML)":[7],"workloads":[8],"have":[9],"driven":[10],"the":[11,158],"design":[12],"ML":[14,31,90],"accelerators":[15,32],"aiming":[16],"at":[17],"an":[18],"optimal":[19],"tradeoff":[20],"between":[21,145],"efficiency":[22,155],"and":[23,77,121,147,152,167],"flexibility.":[24],"A":[25],"widely":[26],"explored":[27],"architecture":[28],"for":[29,50],"flexible":[30],"is":[33],"based":[34,93],"on":[35,94],"clusters":[36],"lightweight":[38],"instruction":[39,48,60],"processors":[40],"sharing":[41],"multi-banked":[42],"L1":[43,136],"memory,":[44],"augmented":[45],"with":[46,65,174],"specialized":[47,171],"extensions":[49,61],"key":[51],"ML-related":[52],"computations,":[53],"such":[54],"as":[55],"matrix":[56],"multiplication":[57],"(matmul).":[58],"However,":[59],"should":[62],"be":[63],"coupled":[64],"microarchitectural":[66],"optimizations":[67],"that":[68,103],"remove":[69,118],"inefficiencies":[70,106],"due":[71],"to":[72,117,131,169],"control":[73,119],"flow":[74],"(loop":[75],"handling)":[76],"memory":[78,124],"access,":[79],"without":[80],"drastically":[81],"increasing":[82],"processor":[83],"complexity.":[84],"Moving":[85],"from":[86],"a":[87,99,122,127,170,183,188],"state-of-the-art":[88],"(SoA)":[89],"accelerator":[91],"cluster":[92],"RISC-V":[95,161],"processors,":[96],"we":[97,141],"propose":[98],"low-overhead":[100],"optimized":[101],"microarchitecture":[102],"eliminates":[104],"these":[105,139],"almost":[107],"entirely":[108],"while":[109,181],"retaining":[110],"programmability.":[111],"We":[112,163],"introduce":[113],"\"zero-overhead":[114],"loop":[115],"nests\"":[116],"overheads,":[120],"\"zero-conflict":[123],"subsystem\",":[125],"leveraging":[126],"novel":[128],"double-buffering-aware":[129],"interconnect,":[130],"eliminate":[132],"bank":[133],"conflicts":[134],"in":[135,178],"memory.":[137],"With":[138],"enhancements,":[140],"attain":[142],"near-ideal":[143],"utilizations":[144,166],"96.1%":[146],"99.4%,":[148],"achieving":[149],"11%":[150],"performance":[151,168],"8%":[153],"energy":[154,179],"improvements":[156],"over":[157],"baseline":[159],"SoA":[160,172],"cluster.":[162],"demonstrate":[164],"comparable":[165],"accelerator,":[173],"only":[175],"12%":[176],"difference":[177],"efficiency,":[180],"providing":[182],"fully-programmable":[184],"general-purpose":[185],"solution":[186],"supporting":[187],"significantly":[189],"wider":[190],"range":[191],"workloads.":[193]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-19T17:40:00.097472","created_date":"2025-12-03T00:00:00"}
