{"id":"https://openalex.org/W4386260503","doi":"https://doi.org/10.1109/infocom53939.2023.10228920","title":"Dynamic Resource Allocation for Deep Learning Clusters with Separated Compute and Storage","display_name":"Dynamic Resource Allocation for Deep Learning Clusters with Separated Compute and Storage","publication_year":2023,"publication_date":"2023-05-17","ids":{"openalex":"https://openalex.org/W4386260503","doi":"https://doi.org/10.1109/infocom53939.2023.10228920"},"language":"en","primary_location":{"id":"doi:10.1109/infocom53939.2023.10228920","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/infocom53939.2023.10228920","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2023 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028976297","display_name":"Mingxia Li","orcid":"https://orcid.org/0000-0001-7329-321X"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Mingxia Li","raw_affiliation_strings":["University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049750274","display_name":"Zhenhua Han","orcid":"https://orcid.org/0000-0002-2880-7100"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenhua Han","raw_affiliation_strings":["Microsoft Research Asia,China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia,China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100458193","display_name":"Chi Zhang","orcid":"https://orcid.org/0000-0002-6528-1427"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chi Zhang","raw_affiliation_strings":["University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024396134","display_name":"Ruiting Zhou","orcid":"https://orcid.org/0000-0001-9681-6482"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruiting Zhou","raw_affiliation_strings":["Southeast University,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111031758","display_name":"Yuanchi Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanchi Liu","raw_affiliation_strings":["University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067465324","display_name":"Haisheng Tan","orcid":"https://orcid.org/0000-0002-3133-1430"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haisheng Tan","raw_affiliation_strings":["University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China,CAS Key Lab of Wireless-Optical Communications,China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5028976297"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.09971767,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7686728239059448},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.6814618110656738},{"id":"https://openalex.org/keywords/resource-management","display_name":"Resource management (computing)","score":0.46069878339767456},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.4571509063243866},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4267085790634155},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.4113163948059082},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3848229646682739},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.22437164187431335}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7686728239059448},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.6814618110656738},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.46069878339767456},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4571509063243866},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4267085790634155},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.4113163948059082},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3848229646682739},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.22437164187431335}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/infocom53939.2023.10228920","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/infocom53939.2023.10228920","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2023 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W1984309565","https://openalex.org/W2108598243","https://openalex.org/W2183341477","https://openalex.org/W2194775991","https://openalex.org/W2504213755","https://openalex.org/W2524365899","https://openalex.org/W2896457183","https://openalex.org/W2899071864","https://openalex.org/W2910100551","https://openalex.org/W3047371394","https://openalex.org/W3047610867","https://openalex.org/W3091880889","https://openalex.org/W3138303811","https://openalex.org/W4372261197","https://openalex.org/W6697698479","https://openalex.org/W6728047685","https://openalex.org/W6735916004","https://openalex.org/W6754256767","https://openalex.org/W6755207826","https://openalex.org/W6756009870","https://openalex.org/W6759814162","https://openalex.org/W6774103200","https://openalex.org/W6781728138","https://openalex.org/W6782839094"],"related_works":["https://openalex.org/W2923452570","https://openalex.org/W206598027","https://openalex.org/W2978610750","https://openalex.org/W2138260944","https://openalex.org/W2022931285","https://openalex.org/W1589966275","https://openalex.org/W2086872282","https://openalex.org/W2137789903","https://openalex.org/W2138781885","https://openalex.org/W2153007255"],"abstract_inverted_index":{"The":[0],"separation":[1],"of":[2,13,20,73,86,98,122,145],"compute":[3,53],"and":[4,94,116,158,174,180,189],"storage":[5,39],"in":[6,51],"modern":[7],"cloud":[8],"services":[9],"eases":[10],"the":[11,18,52,57,63,77,82,106,129,142,193],"deployment":[12],"general":[14],"applications.":[15],"However,":[16],"with":[17,66,132,168],"development":[19],"accelerators":[21],"such":[22],"as":[23],"GPU/TPU,":[24],"Deep":[25],"Learning":[26],"(DL)":[27],"training":[28,43,112,130],"is":[29,71],"suffering":[30],"from":[31,38],"potential":[32],"IO":[33,64,178],"bottlenecks":[34],"when":[35],"loading":[36],"data":[37],"clusters.":[40],"Therefore,":[41],"DL":[42,87,99,197],"jobs":[44,93],"need":[45],"to":[46,55,75,81,127,150,152,187],"either":[47],"create":[48],"local":[49],"cache":[50],"cluster":[54],"reduce":[56,177],"bandwidth":[58,68],"demands":[59],"or":[60],"scale":[61,141],"up":[62,186],"capacity":[65],"higher":[67],"cost.":[69],"It":[70],"full":[72],"challenges":[74],"choose":[76],"best":[78],"strategy":[79],"due":[80],"heterogeneous":[83],"cache/IO":[84],"preference":[85],"models,":[88],"shared":[89],"dataset":[90,114],"among":[91],"multiple":[92],"dynamic":[95],"GPU":[96,120,143],"scaling":[97],"training.":[100,198],"In":[101],"this":[102],"work,":[103],"we":[104,124,147],"exploit":[105],"job":[107,155],"characteristics":[108],"based":[109],"on":[110],"their":[111],"throughput,":[113],"size":[115],"scalability.":[117],"For":[118,136],"fixed":[119],"allocation":[121],"jobs,":[123,146],"propose":[125],"CBA":[126,149,173],"minimize":[128],"cost":[131,179],"a":[133,163],"closed-form":[134],"approach.":[135],"clusters":[137],"that":[138,172],"can":[139,176],"automatically":[140],"allocations":[144],"extend":[148],"AutoCBA":[151,175],"support":[153],"diverse":[154],"utility":[156],"functions":[157],"improve":[159,181],"social":[160,183],"welfare":[161,184],"within":[162],"limited":[164],"budget.":[165],"Extensive":[166],"experiments":[167],"production":[169],"traces":[170],"validate":[171],"total":[182],"by":[185],"20.5%":[188],"2.27\u00d7,":[190],"respectively,":[191],"over":[192],"state-of-the-art":[194],"schedulers":[195],"for":[196]},"counts_by_year":[],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
