{"id":"https://openalex.org/W4388076418","doi":"https://doi.org/10.1145/3620678.3624669","title":"Anticipatory Resource Allocation for ML Training","display_name":"Anticipatory Resource Allocation for ML Training","publication_year":2023,"publication_date":"2023-10-30","ids":{"openalex":"https://openalex.org/W4388076418","doi":"https://doi.org/10.1145/3620678.3624669"},"language":"en","primary_location":{"id":"doi:10.1145/3620678.3624669","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3620678.3624669","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3620678.3624669","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3620678.3624669","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002775676","display_name":"Tapan Chugh","orcid":"https://orcid.org/0000-0002-1507-6708"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Tapan Chugh","raw_affiliation_strings":["Microsoft Research, University of Washington"],"affiliations":[{"raw_affiliation_string":"Microsoft Research, University of Washington","institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023881736","display_name":"Srikanth Kandula","orcid":"https://orcid.org/0000-0001-9494-6435"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Srikanth Kandula","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101497042","display_name":"Arvind Krishnamurthy","orcid":"https://orcid.org/0000-0002-9505-9528"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arvind Krishnamurthy","raw_affiliation_strings":["University of Washington"],"affiliations":[{"raw_affiliation_string":"University of Washington","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013518229","display_name":"Ratul Mahajan","orcid":"https://orcid.org/0009-0005-8005-6948"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ratul Mahajan","raw_affiliation_strings":["University of Washington"],"affiliations":[{"raw_affiliation_string":"University of Washington","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032872922","display_name":"Ishai Menache","orcid":"https://orcid.org/0000-0002-2540-236X"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Ishai Menache","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https://openalex.org/I4210164937"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5002775676"],"corresponding_institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I201448701"],"apc_list":null,"apc_paid":null,"fwci":1.815,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.85008439,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"410","last_page":"426"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.79632169008255},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.7542445659637451},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.63127601146698},{"id":"https://openalex.org/keywords/idle","display_name":"Idle","score":0.6251934170722961},{"id":"https://openalex.org/keywords/anticipation","display_name":"Anticipation (artificial intelligence)","score":0.5906316041946411},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5768155455589294},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.4662109315395355},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.41408833861351013},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.3907303214073181},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.384623646736145},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.2264363169670105},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1571153700351715},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.15662220120429993},{"id":"https://openalex.org/keywords/operations-management","display_name":"Operations management","score":0.14449933171272278},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.13125726580619812},{"id":"https://openalex.org/keywords/economics","display_name":"Economics","score":0.09031862020492554}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.79632169008255},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.7542445659637451},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.63127601146698},{"id":"https://openalex.org/C16320812","wikidata":"https://www.wikidata.org/wiki/Q1812200","display_name":"Idle","level":2,"score":0.6251934170722961},{"id":"https://openalex.org/C176777502","wikidata":"https://www.wikidata.org/wiki/Q4774623","display_name":"Anticipation (artificial intelligence)","level":2,"score":0.5906316041946411},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5768155455589294},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.4662109315395355},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.41408833861351013},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3907303214073181},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.384623646736145},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2264363169670105},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1571153700351715},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.15662220120429993},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.14449933171272278},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.13125726580619812},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.09031862020492554},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3620678.3624669","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3620678.3624669","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3620678.3624669","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3620678.3624669","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3620678.3624669","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3620678.3624669","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1452086972","display_name":null,"funder_award_id":"JUMP 2.0","funder_id":"https://openalex.org/F4320306087","funder_display_name":"Semiconductor Research Corporation"}],"funders":[{"id":"https://openalex.org/F4320306087","display_name":"Semiconductor Research Corporation","ror":"https://ror.org/047z4n946"},{"id":"https://openalex.org/F4320307791","display_name":"Cisco Systems","ror":"https://ror.org/03yt1ez60"},{"id":"https://openalex.org/F4320316785","display_name":"VMware","ror":null},{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4388076418.pdf","grobid_xml":"https://content.openalex.org/works/W4388076418.grobid-xml"},"referenced_works_count":25,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W2075233755","https://openalex.org/W2078098553","https://openalex.org/W2149273293","https://openalex.org/W2149939304","https://openalex.org/W2217174568","https://openalex.org/W2295598076","https://openalex.org/W2546571074","https://openalex.org/W2734941459","https://openalex.org/W2764100055","https://openalex.org/W2798515322","https://openalex.org/W2798933012","https://openalex.org/W2914209329","https://openalex.org/W2949676527","https://openalex.org/W2962725887","https://openalex.org/W3004103998","https://openalex.org/W3015423804","https://openalex.org/W3022298203","https://openalex.org/W3128615300","https://openalex.org/W3138327474","https://openalex.org/W3158545849","https://openalex.org/W3163287424","https://openalex.org/W3197816522","https://openalex.org/W4213358906","https://openalex.org/W4313160155"],"related_works":["https://openalex.org/W17155033","https://openalex.org/W3207760230","https://openalex.org/W1496222301","https://openalex.org/W1590307681","https://openalex.org/W4312814274","https://openalex.org/W4285370786","https://openalex.org/W2296488620","https://openalex.org/W2358353312","https://openalex.org/W2353836703","https://openalex.org/W41015297"],"abstract_inverted_index":{"Our":[0],"analysis":[1],"of":[2,56,124,135,140],"a":[3,25,102],"large":[4],"public":[5],"cloud":[6],"ML":[7],"training":[8],"service":[9],"shows":[10],"that":[11,69,87,108,145],"resources":[12,20,37],"remain":[13],"unused":[14],"likely":[15],"because":[16],"users":[17],"statically":[18],"(over-)allocate":[19],"for":[21,27],"their":[22],"jobs":[23,43],"given":[24],"desire":[26],"predictable":[28],"performance,":[29],"and":[30,60,80],"state-of-the-art":[31],"schedulers":[32,144],"do":[33,88],"not":[34,89],"exploit":[35],"idle":[36],"lest":[38],"they":[39],"slow":[40],"down":[41],"some":[42],"excessively.":[44],"We":[45,67,100],"consider":[46],"if":[47],"an":[48,122],"anticipatory":[49,104],"scheduler,":[50,130],"which":[51],"schedules":[52],"based":[53],"on":[54],"predictions":[55],"future":[57],"job":[58,119,136],"arrivals":[59],"durations,":[61],"can":[62],"improve":[63],"over":[64,126],"the":[65,82,127,133],"state-of-the-art.":[66],"find":[68],"realizing":[70],"gains":[71],"from":[72],"anticipation":[73],"requires":[74],"dealing":[75],"effectively":[76],"with":[77],"prediction":[78],"errors,":[79],"even":[81],"best":[83],"predictors":[84],"have":[85],"errors":[86],"conform":[90],"to":[91,111,143],"simple":[92],"models":[93],"(such":[94],"as":[95],"bounded":[96],"or":[97],"i.i.d.":[98],"error).":[99],"devise":[101],"novel":[103],"scheduler":[105],"called":[106],"SIA":[107,117],"is":[109],"robust":[110],"such":[112],"errors.":[113],"On":[114],"real":[115],"workloads,":[116],"reduces":[118],"latency":[120],"by":[121,138],"average":[123],"2.83\u00d7":[125],"current":[128],"production":[129],"while":[131],"reducing":[132],"likelihood":[134],"slowdowns":[137],"orders":[139],"magnitude":[141],"relative":[142],"na\u00efvely":[146],"share":[147],"resources.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
