{"id":"https://openalex.org/W7128539017","doi":"https://doi.org/10.48550/arxiv.2602.08296","title":"MonkeyTree: Near-Minimal Congestion for Multi-tenant Training via Migration","display_name":"MonkeyTree: Near-Minimal Congestion for Multi-tenant Training via Migration","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7128539017","doi":"https://doi.org/10.48550/arxiv.2602.08296"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.08296","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125506359","display_name":"Anton A. Zabreyko","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zabreyko, Anton A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125524194","display_name":"Weiyang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Weiyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5058729992","display_name":"Manya Ghobadi","orcid":"https://orcid.org/0000-0002-4095-1519"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghobadi, Manya","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5125506359"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.7674999833106995,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.7674999833106995,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.04899999871850014,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.042399998754262924,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/network-packet","display_name":"Network packet","score":0.6047999858856201},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5428000092506409},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.47049999237060547},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.46540001034736633},{"id":"https://openalex.org/keywords/linear-programming","display_name":"Linear programming","score":0.39640000462532043},{"id":"https://openalex.org/keywords/network-congestion","display_name":"Network congestion","score":0.38940000534057617},{"id":"https://openalex.org/keywords/flow-network","display_name":"Flow network","score":0.3880000114440918},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.38089999556541443},{"id":"https://openalex.org/keywords/network-topology","display_name":"Network topology","score":0.37040001153945923}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7271999716758728},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.6047999858856201},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5428000092506409},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.47049999237060547},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.46540001034736633},{"id":"https://openalex.org/C41045048","wikidata":"https://www.wikidata.org/wiki/Q202843","display_name":"Linear programming","level":2,"score":0.39640000462532043},{"id":"https://openalex.org/C195563490","wikidata":"https://www.wikidata.org/wiki/Q180368","display_name":"Network congestion","level":3,"score":0.38940000534057617},{"id":"https://openalex.org/C114809511","wikidata":"https://www.wikidata.org/wiki/Q1412924","display_name":"Flow network","level":2,"score":0.3880000114440918},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.38089999556541443},{"id":"https://openalex.org/C199845137","wikidata":"https://www.wikidata.org/wiki/Q145490","display_name":"Network topology","level":2,"score":0.37040001153945923},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.3693999946117401},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.3646000027656555},{"id":"https://openalex.org/C54108766","wikidata":"https://www.wikidata.org/wiki/Q391064","display_name":"Packet loss","level":3,"score":0.35910001397132874},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3465999960899353},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3368000090122223},{"id":"https://openalex.org/C2781162219","wikidata":"https://www.wikidata.org/wiki/Q26250693","display_name":"Replicate","level":2,"score":0.33169999718666077},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3280999958515167},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.32359999418258667},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.30979999899864197},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.29660001397132874},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2921999990940094},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C2779888511","wikidata":"https://www.wikidata.org/wiki/Q244156","display_name":"Traffic congestion","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C159631557","wikidata":"https://www.wikidata.org/wiki/Q1546066","display_name":"Networking hardware","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C137635306","wikidata":"https://www.wikidata.org/wiki/Q182667","display_name":"Pareto principle","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.2651999890804291},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.26460000872612},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C44154836","wikidata":"https://www.wikidata.org/wiki/Q45045","display_name":"Simulation","level":1,"score":0.2565999925136566},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C194051981","wikidata":"https://www.wikidata.org/wiki/Q1337691","display_name":"Economic shortage","level":3,"score":0.2508000135421753}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.08296","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.08296","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.08296","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.08296","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.6368545889854431}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,139,188],"present":[1],"MonkeyTree,":[2],"the":[3,160,220],"first":[4],"system":[5,183],"to":[6,103,135,150,162,199],"mitigate":[7],"network":[8],"congestion":[9,33],"in":[10],"multi-tenant":[11],"GPU":[12],"clusters":[13,196],"through":[14],"job-migration":[15],"based":[16],"defragmentation":[17,124],"rather":[18],"than":[19],"network-layer":[20],"techniques.":[21],"As":[22],"cloud":[23],"operators":[24],"co-locate":[25],"ML":[26,73],"training":[27,35,74],"jobs":[28],"on":[29,47,205,224],"shared,":[30],"oversubscribed":[31],"networks,":[32],"degrades":[34],"throughput":[36],"for":[37],"over":[38,176,219],"a":[39,85,119,141,192,206,225,231,235],"third":[40],"of":[41,72,182,197,227,252],"jobs.":[42],"Prior":[43],"approaches":[44],"either":[45],"rely":[46],"routing":[48],"and":[49,158,203,240],"flow":[50,82],"scheduling--which":[51],"we":[52],"show":[53],"have":[54],"fundamental":[55],"limits":[56],"when":[57],"traffic":[58],"exceeds":[59],"capacity--or":[60],"require":[61],"costly":[62],"full-bisection":[63],"bandwidth":[64],"topologies":[65],"with":[66,105,165,230],"packet":[67],"spraying.":[68],"MonkeyTree":[69,122,190,210,243],"exploits":[70],"characteristics":[71],"traffic:":[75],"ring-based":[76],"collectives":[77],"generate":[78],"exactly":[79],"one":[80],"cross-rack":[81,154],"per":[83,156,168,186],"rack":[84],"job":[86,213,246],"spans,":[87],"making":[88,100],"congestion-free":[89],"placements":[90],"achievable.":[91],"The":[92],"sparse":[93],"constraint":[94],"structure":[95],"admits":[96],"abundant":[97],"valid":[98],"configurations,":[99],"them":[101],"easy":[102],"reach":[104],"few":[106,120],"migrations.":[107],"Once":[108],"reached,":[109],"low":[110],"fragmentation":[111,137],"is":[112,171],"self-reinforcing,":[113],"as":[114,125],"new":[115],"arrivals":[116],"disturb":[117],"only":[118,179],"racks.":[121],"formulates":[123],"an":[126],"integer":[127],"linear":[128],"program":[129],"that":[130],"minimizes":[131],"worker":[132],"movements,":[133],"subject":[134],"per-rack":[136],"bounds.":[138],"prove":[140],"tight":[142],"bound":[143],"showing":[144],"any":[145],"placement":[146],"can":[147],"be":[148],"defragmented":[149],"at":[151],"most":[152],"two":[153],"fragments":[155],"ToR,":[157],"extend":[159],"formulation":[161],"hybrid":[163],"parallelism":[164],"multiple":[166],"rings":[167],"server.":[169],"Migration":[170],"implemented":[172],"via":[173],"in-memory":[174],"checkpoint-and-restore":[175],"RDMA,":[177],"incurring":[178],"9.02":[180],"seconds":[181],"overhead":[184],"end-to-end":[185],"worker.":[187],"evaluate":[189],"using":[191],"custom":[193],"simulator":[194],"modeling":[195],"up":[198],"2,048":[200,241],"H200":[201],"GPUs":[202,229],"prototype":[204],"five-node":[207],"A100":[208],"testbed.":[209],"improves":[211],"average":[212],"completion":[214,247],"time":[215,248],"by":[216],"14":[217],"percent":[218,251],"next":[221],"best":[222],"baseline":[223],"cluster":[226],"1,024":[228],"4:1":[232],"oversubscription.":[233],"With":[234],"high":[236],"16:1":[237],"oversubscription":[238],"ratio":[239],"GPUs,":[242],"keeps":[244],"p99":[245],"within":[249],"5":[250],"ideal.":[253]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-11T00:00:00"}
