{"id":"https://openalex.org/W7140196564","doi":"https://doi.org/10.48550/arxiv.2603.21444","title":"Communication-Avoiding SpGEMM via Trident Partitioning on Hierarchical GPU Interconnects","display_name":"Communication-Avoiding SpGEMM via Trident Partitioning on Hierarchical GPU Interconnects","publication_year":2026,"publication_date":"2026-03-22","ids":{"openalex":"https://openalex.org/W7140196564","doi":"https://doi.org/10.48550/arxiv.2603.21444"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21444","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21444","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bellavita, Julian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bellavita, Julian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Pichetti, Lorenzo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pichetti, Lorenzo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Pasquali, Thomas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pasquali, Thomas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Vella, Flavio","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vella, Flavio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Guidi, Giulia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guidi, Giulia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.7408000230789185,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.7408000230789185,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.07429999858140945,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.03350000083446503,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.8560000061988831},{"id":"https://openalex.org/keywords/trident","display_name":"Trident","score":0.850600004196167},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5267999768257141},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5238999724388123},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.4812999963760376},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.47369998693466187},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.42980000376701355},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.42149999737739563},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.4056999981403351}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.8560000061988831},{"id":"https://openalex.org/C2777909563","wikidata":"https://www.wikidata.org/wiki/Q271628","display_name":"Trident","level":2,"score":0.850600004196167},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.807699978351593},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5864999890327454},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5267999768257141},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5238999724388123},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.4812999963760376},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.47369998693466187},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.42980000376701355},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.42149999737739563},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.36550000309944153},{"id":"https://openalex.org/C2778837361","wikidata":"https://www.wikidata.org/wiki/Q2450880","display_name":"Exascale computing","level":3,"score":0.3628000020980835},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.3497999906539917},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.31679999828338623},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.3043999969959259},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.29899999499320984},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.29840001463890076},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2879999876022339},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2874999940395355},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2847999930381775},{"id":"https://openalex.org/C147297375","wikidata":"https://www.wikidata.org/wiki/Q6674930","display_name":"Look-ahead","level":2,"score":0.28209999203681946},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C48903430","wikidata":"https://www.wikidata.org/wiki/Q491370","display_name":"Graph partition","level":3,"score":0.2669999897480011}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21444","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21444","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,35],"multiplication":[1],"of":[2,56,132,189,209],"two":[3],"sparse":[4],"matrices,":[5,173],"known":[6],"as":[7],"SpGEMM,":[8],"is":[9,31,139],"a":[10,112,161,180,184],"key":[11],"kernel":[12],"in":[13,92,211],"scientific":[14],"computing":[15],"and":[16,26,49,58,88,96,122,129,149],"large-scale":[17],"data":[18,94],"analytics,":[19],"underpinning":[20],"graph":[21],"algorithms,":[22],"machine":[23],"learning,":[24],"simulations,":[25],"computational":[27],"biology,":[28],"where":[29],"sparsity":[30,37],"often":[32],"highly":[33],"unstructured.":[34],"unstructured":[36,172],"makes":[38],"achieving":[39,174,216],"high":[40],"performance":[41],"challenging":[42],"because":[43],"it":[44],"limits":[45],"both":[46],"memory":[47],"efficiency":[48],"scalability.":[50],"In":[51],"distributed":[52,115],"memory,":[53],"the":[54,84,101,127,140,155,207],"cost":[55],"exchanging":[57],"merging":[59],"partial":[60],"products":[61],"across":[62,165],"nodes":[63],"further":[64],"constrains":[65],"performance.":[66],"These":[67],"issues":[68],"are":[69],"exacerbated":[70],"on":[71,171,200],"modern":[72,133],"heterogeneous":[73,130],"supercomputers":[74],"with":[75,183],"deep,":[76],"hierarchical":[77,128],"GPU":[78],"interconnects.":[79],"Current":[80],"SpGEMM":[81,116,182],"implementations":[82],"overlook":[83],"gap":[85],"between":[86,158],"intra-node":[87,103],"inter-node":[89],"bandwidth,":[90],"resulting":[91],"unnecessary":[93],"movement":[95],"synchronization":[97],"not":[98],"fully":[99],"exploiting":[100],"fast":[102],"interconnect.":[104,135],"To":[105],"address":[106],"these":[107],"challenges,":[108],"we":[109,168,205],"introduce":[110],"Trident,":[111],"hierarchy-aware":[113,147],"2D":[114,181],"algorithm":[117],"that":[118],"uses":[119],"communication-avoiding":[120],"techniques":[121],"asynchronous":[123],"communication":[124,152,194],"to":[125,137,164,176,198,218,222],"exploit":[126],"architecture":[131],"supercomputing":[134],"Central":[136],"Trident":[138,170,191,210],"novel":[141],"trident":[142],"partitioning":[143],"scheme,":[144],"which":[145],"enables":[146],"decomposition":[148],"reduces":[150,192],"internode":[151,193],"by":[153,196],"leveraging":[154],"higher":[156],"bandwidth":[157],"GPUs":[159],"within":[160],"node":[162],"compared":[163,221],"nodes.":[166],"Here,":[167],"evaluate":[169],"up":[175,197,213,217],"$2.38\\times$":[177],"speedup":[178,188,220],"over":[179],"corresponding":[185],"geometric":[186],"mean":[187],"$1.54\\times$.":[190],"volume":[195],"$2\\times$":[199,219],"NERSC's":[201],"Perlmutter":[202],"supercomputer.":[203],"Furthermore,":[204],"demonstrate":[206],"effectiveness":[208],"speeding":[212],"Markov":[214],"Clustering,":[215],"competing":[223],"strategies.":[224]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-25T00:00:00"}
