{"id":"https://openalex.org/W4415125482","doi":"https://doi.org/10.1109/icnp65844.2025.11192367","title":"Canvas: Scalable Collective Communication Scheduling for Large-Scale GPU Clusters","display_name":"Canvas: Scalable Collective Communication Scheduling for Large-Scale GPU Clusters","publication_year":2025,"publication_date":"2025-09-22","ids":{"openalex":"https://openalex.org/W4415125482","doi":"https://doi.org/10.1109/icnp65844.2025.11192367"},"language":"en","primary_location":{"id":"doi:10.1109/icnp65844.2025.11192367","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp65844.2025.11192367","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093548286","display_name":"Chenyang Hei","orcid":"https://orcid.org/0000-0001-5010-1529"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chenyang Hei","raw_affiliation_strings":["Northeastern University,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077280762","display_name":"Yi Zhao","orcid":"https://orcid.org/0000-0001-5368-3595"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Zhao","raw_affiliation_strings":["Northeastern University,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083598234","display_name":"Fuliang Li","orcid":"https://orcid.org/0000-0001-9782-0053"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fuliang Li","raw_affiliation_strings":["Northeastern University,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106856666","display_name":"Chengxi Gao","orcid":"https://orcid.org/0000-0003-1386-7394"},"institutions":[{"id":"https://openalex.org/I4210145761","display_name":"Shenzhen Institutes of Advanced Technology","ror":"https://ror.org/04gh4er46","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210145761"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengxi Gao","raw_affiliation_strings":["Chinese Academy of Sciences,Shenzhen Institutes of Advanced Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Shenzhen Institutes of Advanced Technology","institution_ids":["https://openalex.org/I4210145761"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tongrui Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tongrui Liu","raw_affiliation_strings":["Northeastern University,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043755655","display_name":"Xiuzhu Sha","orcid":null},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiuzhu Sha","raw_affiliation_strings":["Northeastern University,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100326915","display_name":"Xingwei Wang","orcid":"https://orcid.org/0000-0003-2856-4716"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingwei Wang","raw_affiliation_strings":["Northeastern University,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5093548286"],"corresponding_institution_ids":["https://openalex.org/I9224756"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.27896896,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12784","display_name":"Modular Robots and Swarm Intelligence","score":0.9786999821662903,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12784","display_name":"Modular Robots and Swarm Intelligence","score":0.9786999821662903,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9754999876022339,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.974399983882904,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7555000185966492},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.7196999788284302},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6061999797821045},{"id":"https://openalex.org/keywords/network-topology","display_name":"Network topology","score":0.4212000072002411},{"id":"https://openalex.org/keywords/fair-share-scheduling","display_name":"Fair-share scheduling","score":0.37310001254081726},{"id":"https://openalex.org/keywords/dynamic-priority-scheduling","display_name":"Dynamic priority scheduling","score":0.3544999957084656},{"id":"https://openalex.org/keywords/two-level-scheduling","display_name":"Two-level scheduling","score":0.3131999969482422},{"id":"https://openalex.org/keywords/round-robin-scheduling","display_name":"Round-robin scheduling","score":0.3116999864578247}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8489999771118164},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7555000185966492},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.7196999788284302},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.7084000110626221},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6061999797821045},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5665000081062317},{"id":"https://openalex.org/C199845137","wikidata":"https://www.wikidata.org/wiki/Q145490","display_name":"Network topology","level":2,"score":0.4212000072002411},{"id":"https://openalex.org/C31689143","wikidata":"https://www.wikidata.org/wiki/Q733809","display_name":"Fair-share scheduling","level":3,"score":0.37310001254081726},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.3544999957084656},{"id":"https://openalex.org/C119948110","wikidata":"https://www.wikidata.org/wiki/Q7858726","display_name":"Two-level scheduling","level":4,"score":0.3131999969482422},{"id":"https://openalex.org/C175893541","wikidata":"https://www.wikidata.org/wiki/Q1196582","display_name":"Round-robin scheduling","level":4,"score":0.3116999864578247},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C2777338717","wikidata":"https://www.wikidata.org/wiki/Q1762621","display_name":"Vendor","level":2,"score":0.2962999939918518},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2930999994277954},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.28700000047683716},{"id":"https://openalex.org/C2781030343","wikidata":"https://www.wikidata.org/wiki/Q922437","display_name":"InfiniBand","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2784999907016754},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.2531000077724457},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icnp65844.2025.11192367","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp65844.2025.11192367","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W2009571103","https://openalex.org/W2059300917","https://openalex.org/W2969388332","https://openalex.org/W3036878841","https://openalex.org/W3072623287","https://openalex.org/W3092260380","https://openalex.org/W3190806564","https://openalex.org/W4318541593","https://openalex.org/W4390044316","https://openalex.org/W4401176799","https://openalex.org/W4401176822","https://openalex.org/W4404955085","https://openalex.org/W4412610442"],"related_works":[],"abstract_inverted_index":{"State-of-the-art":[0],"deep":[1],"learning":[2],"models":[3],"rely":[4],"on":[5,17],"large":[6],"GPU":[7],"clusters":[8],"and":[9,70,79,122,138,142],"various":[10],"parallelism":[11],"strategies,":[12],"which":[13],"in":[14,41],"turn":[15],"depend":[16],"collective":[18],"communication":[19,72,119],"(CC)":[20],"operators":[21],"to":[22,57,93,107,117,132,154],"synchronize":[23],"data.":[24],"While":[25],"vendor":[26],"libraries":[27],"(e.g.,":[28],"NCCL,":[29],"RCCL)":[30],"provide":[31],"standard":[32],"CC":[33,81],"algorithms,":[34],"they":[35],"often":[36],"suffer":[37],"from":[38],"bandwidth":[39,134],"bottlenecks":[40],"imbalanced":[42],"topologies.":[43],"Recent":[44],"synthesis-based":[45],"methods":[46],"improve":[47],"performance":[48],"but":[49],"face":[50],"three":[51],"key":[52],"limitations:":[53],"poor":[54],"scalability":[55],"due":[56],"the":[58,95],"combinatorial":[59],"explosion":[60],"of":[61,65],"scheduling":[62,82,97,116],"space,":[63],"lack":[64],"support":[66],"for":[67,102,145],"multistage":[68],"execution,":[69],"suboptimal":[71],"throughput.":[73],"We":[74],"propose":[75],"Canvas,":[76],"a":[77],"scalable":[78],"near-optimal":[80],"framework":[83],"that":[84,128],"addresses":[85],"these":[86],"challenges.":[87],"Canvas":[88,129],"introduces:":[89],"(1)":[90],"Hierarchical":[91],"synthesis":[92],"decompose":[94],"global":[96],"problem":[98],"into":[99],"tractable":[100],"subproblems":[101],"scalability.":[103],"(2)":[104],"Collective":[105],"decomposition":[106],"enable":[108],"structured,":[109],"multi-stage":[110],"algorithm":[111],"generation.":[112],"(3)":[113],"Cross-micro-batch":[114],"pipeline":[115],"parallelize":[118],"across":[120],"micro-batches":[121],"maximize":[123],"link":[124],"utilization.":[125],"Evaluations":[126],"show":[127],"achieves":[130],"up":[131],"1.98\u00d7":[133],"speedup":[135],"over":[136,140],"TACCL":[137,152],"3.56\u00d7":[139],"TE-CCL,":[141],"synthesizes":[143],"algorithms":[144],"512-GPU":[146],"topologies":[147],"within":[148,157],"1.77":[149],"hours,":[150],"whereas":[151],"fails":[153],"produce":[155],"results":[156],"24":[158],"hours.":[159]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2025-10-14T00:00:00"}
