{"id":"https://openalex.org/W7133353834","doi":"https://doi.org/10.48550/arxiv.2603.01629","title":"TeraPool: A Physical Design Aware, 1024 RISC-V Cores Shared-L1-Memory Scaled-up Cluster Design with High Bandwidth Main Memory Link","display_name":"TeraPool: A Physical Design Aware, 1024 RISC-V Cores Shared-L1-Memory Scaled-up Cluster Design with High Bandwidth Main Memory Link","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133353834","doi":"https://doi.org/10.48550/arxiv.2603.01629"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01629","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128012976","display_name":"Yichao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Yichao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127890056","display_name":"Marco Bertuletti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bertuletti, Marco","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127930321","display_name":"Chi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127881561","display_name":"Samuel Riedel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Riedel, Samuel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121552380","display_name":"Diyou Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Diyou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127931247","display_name":"Bowen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bowen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Vanelli-Coralli, Alessandro","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vanelli-Coralli, Alessandro","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127929713","display_name":"Luca Benini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Benini, Luca","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5128012976"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9437000155448914,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9437000155448914,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.02410000003874302,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.005400000140070915,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interconnection","display_name":"Interconnection","score":0.5194000005722046},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4569999873638153},{"id":"https://openalex.org/keywords/crossbar-switch","display_name":"Crossbar switch","score":0.44269999861717224},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.41339999437332153},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4050000011920929},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.3993000090122223},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.3937000036239624},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.3474999964237213}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7551000118255615},{"id":"https://openalex.org/C123745756","wikidata":"https://www.wikidata.org/wiki/Q1665949","display_name":"Interconnection","level":2,"score":0.5194000005722046},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4717999994754791},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4569999873638153},{"id":"https://openalex.org/C29984679","wikidata":"https://www.wikidata.org/wiki/Q1929149","display_name":"Crossbar switch","level":2,"score":0.44269999861717224},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.41339999437332153},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4050000011920929},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.3993000090122223},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.3937000036239624},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.3474999964237213},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C190475519","wikidata":"https://www.wikidata.org/wiki/Q544384","display_name":"Massively parallel","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.29510000348091125},{"id":"https://openalex.org/C197129107","wikidata":"https://www.wikidata.org/wiki/Q1921621","display_name":"Merge (version control)","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.26919999718666077},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C117280010","wikidata":"https://www.wikidata.org/wiki/Q180944","display_name":"Register file","level":3,"score":0.25839999318122864},{"id":"https://openalex.org/C182019814","wikidata":"https://www.wikidata.org/wiki/Q1143830","display_name":"Resistive random-access memory","level":3,"score":0.2574999928474426},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.9067100882530212,"id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Shared":[0],"L1-memory":[1],"clusters":[2,34,234],"of":[3,33,77,153,169,179,207,217,231],"streamlined":[4],"instruction":[5],"processors":[6],"(processing":[7],"elements":[8],"-":[9],"PEs)":[10],"are":[11],"commonly":[12],"used":[13],"as":[14],"building":[15],"blocks":[16],"in":[17,51,125,211,236],"modern,":[18],"massively":[19],"parallel":[20],"computing":[21],"architectures":[22,28],"(e.g.":[23],"GP-GPUs).":[24],"Scaling":[25,64],"out":[26],"these":[27],"by":[29,41],"increasing":[30],"the":[31,42,60,66,75,151,170,176,186,215,228,232],"number":[32],"incurs":[35],"computational":[36],"and":[37,46,53,71,197],"power":[38],"overhead,":[39],"caused":[40],"requirement":[43],"to":[44,164,190,199,222],"split":[45],"merge":[47],"large":[48],"data":[49,166],"structures":[50],"chunks":[52,55],"move":[54],"across":[56],"memory":[57,111,146,160],"hierarchies":[58],"via":[59,112],"high-latency":[61],"global":[62],"interconnect.":[63],"up":[65,189,198],"cluster":[67,104,187,221],"reduces":[68],"buffering,":[69],"copy,":[70],"synchronization":[72],"overheads.":[73],"However,":[74],"complexity":[76],"a":[78,88,96,107,113,154,204,219,223],"fully":[79],"connected":[80],"cores-to-L1-memory":[81],"crossbar":[82],"grows":[83],"quadratically":[84],"with":[85],"PE-count,":[86],"posing":[87],"major":[89],"physical":[90],"implementation":[91],"challenge.":[92],"We":[93],"present":[94],"TeraPool,":[95],"physically":[97],"implementable,":[98],"&gt;1000":[99],"floating-point-capable":[100],"RISC-V":[101],"PEs":[102],"scaled-up":[103],"design,":[105],"sharing":[106],"Multi-MegaByte":[108],"&gt;4000-banked":[109],"L1":[110],"low":[114],"latency":[115],"hierarchical":[116,139],"interconnect":[117,141],"(1-7/9/11":[118],"cycles,":[119],"depending":[120],"on":[121,209],"target":[122],"frequency).":[123],"Implemented":[124],"12nm":[126],"FinFET":[127],"technology,":[128],"TeraPool":[129],"achieves":[130],"near-gigahertz":[131],"frequencies":[132],"(910MHz)":[133],"typical,":[134],"0.80":[135],"V/25C.":[136],"The":[137],"energy-efficient":[138],"PE-to-L1-memory":[140],"consumes":[142],"only":[143],"9-13.5pJ":[144],"for":[145],"bank":[147],"accesses,":[148],"just":[149],"0.74-1.1x":[150],"cost":[152],"FP32":[155],"FMA.":[156],"A":[157],"high-bandwidth":[158],"main":[159,182],"link":[161],"is":[162],"designed":[163],"manage":[165],"transfers":[167,174],"in/out":[168],"shared":[171],"L1,":[172],"sustaining":[173],"at":[175],"full":[177],"bandwidth":[178],"an":[180],"HBM2E":[181],"memory.":[183],"At":[184],"910MHz,":[185],"delivers":[188],"1.89":[191],"single":[192],"precision":[193],"TFLOP/s":[194],"peak":[195],"performance":[196],"200GFLOP/s/W":[200],"energy":[201],"efficiency":[202],"(at":[203],"high":[205],"IPC/PE":[206],"0.8":[208],"average)":[210],"benchmark":[212],"kernels,":[213],"demonstrating":[214],"feasibility":[216],"scaling":[218],"shared-L1":[220],"thousand":[224],"PEs,":[225],"four":[226],"times":[227],"PE":[229],"count":[230],"largest":[233],"reported":[235],"literature.":[237]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2026-03-04T00:00:00"}
