{"id":"https://openalex.org/W4383899813","doi":"https://doi.org/10.1109/tpds.2023.3293835","title":"DeepBoot: Dynamic Scheduling System for Training and Inference Deep Learning Tasks in GPU Cluster","display_name":"DeepBoot: Dynamic Scheduling System for Training and Inference Deep Learning Tasks in GPU Cluster","publication_year":2023,"publication_date":"2023-07-11","ids":{"openalex":"https://openalex.org/W4383899813","doi":"https://doi.org/10.1109/tpds.2023.3293835"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2023.3293835","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2023.3293835","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055060853","display_name":"Zhenqian Chen","orcid":"https://orcid.org/0000-0001-7519-2295"},"institutions":[{"id":"https://openalex.org/I168879160","display_name":"Zhejiang University of Science and Technology","ror":"https://ror.org/05mx0wr29","country_code":"CN","type":"education","lineage":["https://openalex.org/I168879160"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhenqian Chen","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-7519-2295","affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I168879160"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101765691","display_name":"Xinkui Zhao","orcid":"https://orcid.org/0000-0002-1115-5652"},"institutions":[{"id":"https://openalex.org/I109935558","display_name":"Ningbo University","ror":"https://ror.org/03et85d35","country_code":"CN","type":"education","lineage":["https://openalex.org/I109935558"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinkui Zhao","raw_affiliation_strings":["School of Software Technology, Zhejiang University, Ningbo, China"],"raw_orcid":"https://orcid.org/0000-0002-1115-5652","affiliations":[{"raw_affiliation_string":"School of Software Technology, Zhejiang University, Ningbo, China","institution_ids":["https://openalex.org/I109935558","https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075513322","display_name":"Chen Zhi","orcid":"https://orcid.org/0000-0002-1273-2992"},"institutions":[{"id":"https://openalex.org/I109935558","display_name":"Ningbo University","ror":"https://ror.org/03et85d35","country_code":"CN","type":"education","lineage":["https://openalex.org/I109935558"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen Zhi","raw_affiliation_strings":["School of Software Technology, Zhejiang University, Ningbo, China"],"raw_orcid":"https://orcid.org/0000-0002-1273-2992","affiliations":[{"raw_affiliation_string":"School of Software Technology, Zhejiang University, Ningbo, China","institution_ids":["https://openalex.org/I109935558","https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069353502","display_name":"Jianwei Yin","orcid":"https://orcid.org/0000-0003-4703-7348"},"institutions":[{"id":"https://openalex.org/I168879160","display_name":"Zhejiang University of Science and Technology","ror":"https://ror.org/05mx0wr29","country_code":"CN","type":"education","lineage":["https://openalex.org/I168879160"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwei Yin","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-4703-7348","affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I168879160"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5055060853"],"corresponding_institution_ids":["https://openalex.org/I168879160"],"apc_list":null,"apc_paid":null,"fwci":1.6485,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.86066665,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"34","issue":"9","first_page":"2553","last_page":"2567"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8193935751914978},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7525084018707275},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5454994440078735},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5243784785270691},{"id":"https://openalex.org/keywords/gpu-cluster","display_name":"GPU cluster","score":0.49981188774108887},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.4701659381389618},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.4488058388233185},{"id":"https://openalex.org/keywords/computer-cluster","display_name":"Computer cluster","score":0.43489158153533936},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.39383596181869507},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3552544116973877},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.16523540019989014},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.0955890417098999}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8193935751914978},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7525084018707275},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5454994440078735},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5243784785270691},{"id":"https://openalex.org/C2781335571","wikidata":"https://www.wikidata.org/wiki/Q2633544","display_name":"GPU cluster","level":3,"score":0.49981188774108887},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.4701659381389618},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.4488058388233185},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.43489158153533936},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.39383596181869507},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3552544116973877},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.16523540019989014},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.0955890417098999},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2023.3293835","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2023.3293835","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4699999988079071,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[{"id":"https://openalex.org/G1977736942","display_name":null,"funder_award_id":"61825205","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":68,"referenced_works":["https://openalex.org/W95152782","https://openalex.org/W2031489346","https://openalex.org/W2072803231","https://openalex.org/W2105947650","https://openalex.org/W2108598243","https://openalex.org/W2126105956","https://openalex.org/W2193413348","https://openalex.org/W2194775991","https://openalex.org/W2219888463","https://openalex.org/W2290712622","https://openalex.org/W2605350416","https://openalex.org/W2794670651","https://openalex.org/W2798515322","https://openalex.org/W2896457183","https://openalex.org/W2899071864","https://openalex.org/W2919594608","https://openalex.org/W2953384591","https://openalex.org/W2963403751","https://openalex.org/W2963748441","https://openalex.org/W2964108773","https://openalex.org/W3022298203","https://openalex.org/W3047565185","https://openalex.org/W3096484587","https://openalex.org/W3096583839","https://openalex.org/W3097904259","https://openalex.org/W3115029474","https://openalex.org/W3118608800","https://openalex.org/W3130934537","https://openalex.org/W3134991928","https://openalex.org/W3154142431","https://openalex.org/W3159050454","https://openalex.org/W3168584267","https://openalex.org/W3175707606","https://openalex.org/W3177263144","https://openalex.org/W3179122174","https://openalex.org/W3191069914","https://openalex.org/W3197816522","https://openalex.org/W3209503812","https://openalex.org/W4221141126","https://openalex.org/W4283204791","https://openalex.org/W4289300273","https://openalex.org/W4292779060","https://openalex.org/W4293584584","https://openalex.org/W4295312788","https://openalex.org/W6603838645","https://openalex.org/W6687483927","https://openalex.org/W6687566353","https://openalex.org/W6713134421","https://openalex.org/W6730956707","https://openalex.org/W6750227808","https://openalex.org/W6755207826","https://openalex.org/W6756009870","https://openalex.org/W6756592394","https://openalex.org/W6758283263","https://openalex.org/W6759814162","https://openalex.org/W6766978945","https://openalex.org/W6778883912","https://openalex.org/W6779103662","https://openalex.org/W6781728138","https://openalex.org/W6782839094","https://openalex.org/W6785197036","https://openalex.org/W6787673396","https://openalex.org/W6787972765","https://openalex.org/W6793903029","https://openalex.org/W6796539980","https://openalex.org/W6798686915","https://openalex.org/W6809572605","https://openalex.org/W7067822191"],"related_works":["https://openalex.org/W2057057690","https://openalex.org/W2368184788","https://openalex.org/W2358964818","https://openalex.org/W2359535128","https://openalex.org/W2381332051","https://openalex.org/W2321443665","https://openalex.org/W2375699995","https://openalex.org/W2043719711","https://openalex.org/W2140069467","https://openalex.org/W2364072231"],"abstract_inverted_index":{"Deep":[0],"learning":[1,148],"tasks":[2,23],"(DLT)":[3],"include":[4],"training":[5,10,37,48,82,100,105,124],"and":[6,21,38,101,107,142,156],"inference":[7,22,39,55,66,78,102,114,134,172],"tasks,":[8],"where":[9],"DLTs":[11,49,106],"have":[12],"requirements":[13],"on":[14,126,139],"minimizing":[15],"average":[16,158],"job":[17],"completion":[18],"time":[19],"(JCT)":[20],"need":[24],"sufficient":[25],"GPUs":[26,52,75,97,169],"to":[27,43,63,95,128],"meet":[28],"real-time":[29],"performance.":[30],"Unfortunately,":[31],"existing":[32],"work":[33],"separately":[34],"deploys":[35],"multi-tenant":[36],"GPU":[40,60,135],"cluster,":[41],"leading":[42],"the":[44,54,64,70,77,81,99,109,130,140,164,171],"high":[45],"JCT":[46,159],"of":[47],"with":[50,163],"limited":[51],"when":[53,112],"cluster":[56,79],"is":[57],"under":[58],"insufficient":[59],"utilization":[61],"due":[62],"periodic":[65],"workload.":[67],"DeepBoot":[68,86,117,152],"solves":[69],"challenges":[71],"by":[72,133],"utilizing":[73,167],"idle":[74,168],"in":[76,98,145,170],"for":[80,104],"DLTs.":[83],"Specifically,":[84],"1)":[85],"designs":[87],"<italic":[88,119],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[89,120],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">adaptive":[90],"task":[91],"scaling</i>":[92],"(ATS)":[93],"algorithm":[94],"allocate":[96],"clusters":[103],"minimize":[108],"performance":[110],"loss":[111],"reclaiming":[113],"GPUs.":[115],"2)":[116],"implements":[118],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">auto-fast":[121],"elastic</i>":[122],"(AFE)":[123],"based":[125],"Pollux":[127],"reduce":[129],"restart":[131],"overhead":[132],"reclaiming.":[136],"Our":[137],"implementation":[138],"testbed":[141],"large-scale":[143],"simulation":[144],"Microsoft":[146],"deep":[147],"workload":[149],"shows":[150],"that":[151],"can":[153],"achieve":[154],"32%":[155],"38%":[157],"reduction":[160],"respectively":[161],"compared":[162],"scheduler":[165],"without":[166],"cluster.":[173]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":8},{"year":2024,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
