{"id":"https://openalex.org/W7143506327","doi":"https://doi.org/10.48550/arxiv.2603.26438","title":"A Lightweight High-Throughput Collective-Capable NoC for Large-Scale ML Accelerators","display_name":"A Lightweight High-Throughput Collective-Capable NoC for Large-Scale ML Accelerators","publication_year":2026,"publication_date":"2026-03-27","ids":{"openalex":"https://openalex.org/W7143506327","doi":"https://doi.org/10.48550/arxiv.2603.26438"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.26438","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26438","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.26438","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130936718","display_name":"Luca Colagrande","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Colagrande, Luca","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120630936","display_name":"Lorenzo Leone","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leone, Lorenzo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130929360","display_name":"Chen Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130910578","display_name":"Tim Fischer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fischer, Tim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130929409","display_name":"Raphael Roth","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roth, Raphael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130939299","display_name":"Luca Benini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Benini, Luca","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.89410001039505,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.89410001039505,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.057999998331069946,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.006300000008195639,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/multicast","display_name":"Multicast","score":0.6690999865531921},{"id":"https://openalex.org/keywords/router","display_name":"Router","score":0.5810999870300293},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.48910000920295715},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.43220001459121704},{"id":"https://openalex.org/keywords/network-on-a-chip","display_name":"Network on a chip","score":0.41830000281333923},{"id":"https://openalex.org/keywords/interconnection","display_name":"Interconnection","score":0.38199999928474426},{"id":"https://openalex.org/keywords/unicast","display_name":"Unicast","score":0.34790000319480896}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7452999949455261},{"id":"https://openalex.org/C32295351","wikidata":"https://www.wikidata.org/wiki/Q899288","display_name":"Multicast","level":2,"score":0.6690999865531921},{"id":"https://openalex.org/C2775896111","wikidata":"https://www.wikidata.org/wiki/Q642560","display_name":"Router","level":2,"score":0.5810999870300293},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.48910000920295715},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.47699999809265137},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.43220001459121704},{"id":"https://openalex.org/C128519102","wikidata":"https://www.wikidata.org/wiki/Q339554","display_name":"Network on a chip","level":2,"score":0.41830000281333923},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.4000999927520752},{"id":"https://openalex.org/C123745756","wikidata":"https://www.wikidata.org/wiki/Q1665949","display_name":"Interconnection","level":2,"score":0.38199999928474426},{"id":"https://openalex.org/C11704745","wikidata":"https://www.wikidata.org/wiki/Q918337","display_name":"Unicast","level":3,"score":0.34790000319480896},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.34769999980926514},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.34470000863075256},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.31299999356269836},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.28929999470710754},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.26669999957084656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.26438","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26438","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.26438","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26438","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.8789209723472595}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"exponential":[1],"increase":[2],"in":[3,147,162],"Machine":[4],"Learning":[5],"(ML)":[6],"model":[7],"size":[8],"and":[9,38,71,121,127,133,166,173,184],"complexity":[10],"has":[11,41],"driven":[12],"unprecedented":[13],"demand":[14],"for":[15,75],"high-performance":[16],"acceleration":[17],"systems.":[18],"As":[19],"technology":[20],"scaling":[21],"enables":[22],"the":[23,34,76,93,99,144],"integration":[24],"of":[25,27,79,136],"thousands":[26],"computing":[28],"elements":[29],"onto":[30],"a":[31,55,88,108,179],"single":[32],"die,":[33],"boundary":[35],"between":[36,131],"distributed":[37],"on-chip":[39,45],"systems":[40],"blurred,":[42],"making":[43],"efficient":[44,64],"collective":[46],"communication":[47,142],"increasingly":[48],"critical.":[49],"In":[50],"this":[51],"work,":[52],"we":[53,118],"present":[54],"lightweight,":[56],"collective-capable":[57],"Network":[58],"on":[59,125],"Chip":[60],"(NoC)":[61],"that":[62,91],"supports":[63],"barrier":[65],"synchronization":[66],"alongside":[67],"scalable,":[68],"high-bandwidth":[69],"multicast":[70,126,172],"reduction":[72,128,174],"operations,":[73],"co-designed":[74],"next":[77],"generation":[78],"ML":[80],"accelerators.":[81],"We":[82],"introduce":[83],"Direct":[84],"Compute":[85],"Access":[86],"(DCA),":[87],"novel":[89],"paradigm":[90],"grants":[92],"interconnect":[94],"fabric":[95],"direct":[96],"access":[97],"to":[98,155,158,164,178,186],"cores'":[100],"computational":[101],"resources,":[102],"enabling":[103],"high-throughput":[104],"in-network":[105,115],"reductions":[106],"with":[107],"small":[109],"16.9%":[110],"router":[111],"area":[112],"overhead.":[113],"Through":[114],"hardware":[116],"acceleration,":[117],"achieve":[119],"5.3x":[120],"2.8x":[122],"geomean":[123],"speedups":[124],"operations":[129],"involving":[130],"1":[132],"32":[134],"KiB":[135],"data,":[137],"respectively.":[138],"Furthermore,":[139],"by":[140],"keeping":[141],"off":[143],"critical":[145],"path":[146],"GEMM":[148],"workloads,":[149],"these":[150],"features":[151],"allow":[152],"our":[153],"architecture":[154],"scale":[156],"efficiently":[157],"large":[159],"meshes,":[160],"resulting":[161],"up":[163,185],"3.8x":[165],"2.4x":[167],"estimated":[168,188],"performance":[169],"gains":[170],"through":[171],"support,":[175],"respectively,":[176],"compared":[177],"baseline":[180],"unicast":[181],"NoC":[182],"architecture,":[183],"1.17x":[187],"energy":[189],"savings.":[190]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-31T00:00:00"}
