{"id":"https://openalex.org/W3206418100","doi":"https://doi.org/10.1145/3458817.3480859","title":"Online evolutionary batch size orchestration for scheduling deep learning workloads in GPU clusters","display_name":"Online evolutionary batch size orchestration for scheduling deep learning workloads in GPU clusters","publication_year":2021,"publication_date":"2021-10-21","ids":{"openalex":"https://openalex.org/W3206418100","doi":"https://doi.org/10.1145/3458817.3480859","mag":"3206418100"},"language":"en","primary_location":{"id":"doi:10.1145/3458817.3480859","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3458817.3480859","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3458817.3480859","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3458817.3480859","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011091722","display_name":"Zhengda Bian","orcid":"https://orcid.org/0000-0002-1906-1781"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Zhengda Bian","raw_affiliation_strings":["National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050734153","display_name":"Shenggui Li","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Shenggui Li","raw_affiliation_strings":["National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100392222","display_name":"Wei Wang","orcid":"https://orcid.org/0000-0003-2262-2508"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei Wang","raw_affiliation_strings":["ByteDance, Singapore"],"affiliations":[{"raw_affiliation_string":"ByteDance, Singapore","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100658705","display_name":"Yang You","orcid":"https://orcid.org/0000-0003-2816-4384"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yang You","raw_affiliation_strings":["National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5011091722"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":4.5331,"has_fulltext":true,"cited_by_count":21,"citation_normalized_percentile":{"value":0.9500954,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"15"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8346114158630371},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6494609117507935},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5998079180717468},{"id":"https://openalex.org/keywords/job-scheduler","display_name":"Job scheduler","score":0.5524140000343323},{"id":"https://openalex.org/keywords/elasticity","display_name":"Elasticity (physics)","score":0.5285483598709106},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5046433210372925},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.47921377420425415},{"id":"https://openalex.org/keywords/job-shop-scheduling","display_name":"Job shop scheduling","score":0.45237764716148376},{"id":"https://openalex.org/keywords/orchestration","display_name":"Orchestration","score":0.43229228258132935},{"id":"https://openalex.org/keywords/tardiness","display_name":"Tardiness","score":0.4283120930194855},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.41316381096839905},{"id":"https://openalex.org/keywords/execution-time","display_name":"Execution time","score":0.4120243191719055},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4110889434814453},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.40279626846313477},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.22021156549453735},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.20666271448135376},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.17645886540412903},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.15661892294883728}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8346114158630371},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6494609117507935},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5998079180717468},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.5524140000343323},{"id":"https://openalex.org/C121854251","wikidata":"https://www.wikidata.org/wiki/Q62932","display_name":"Elasticity (physics)","level":2,"score":0.5285483598709106},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5046433210372925},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.47921377420425415},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.45237764716148376},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.43229228258132935},{"id":"https://openalex.org/C2778047078","wikidata":"https://www.wikidata.org/wiki/Q82299449","display_name":"Tardiness","level":4,"score":0.4283120930194855},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.41316381096839905},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.4120243191719055},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4110889434814453},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.40279626846313477},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.22021156549453735},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.20666271448135376},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.17645886540412903},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.15661892294883728},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3458817.3480859","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3458817.3480859","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3458817.3480859","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3458817.3480859","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3458817.3480859","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3458817.3480859","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.550000011920929,"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320331518","display_name":"Centro Svizzero di Calcolo Scientifico","ror":null},{"id":"https://openalex.org/F4320331617","display_name":"National Supercomputing Centre Singapore","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3206418100.pdf","grobid_xml":"https://content.openalex.org/works/W3206418100.grobid-xml"},"referenced_works_count":20,"referenced_works":["https://openalex.org/W2001265733","https://openalex.org/W2097117768","https://openalex.org/W2141992894","https://openalex.org/W2183341477","https://openalex.org/W2194775991","https://openalex.org/W2546571074","https://openalex.org/W2617242334","https://openalex.org/W2798515322","https://openalex.org/W2914847013","https://openalex.org/W2945696849","https://openalex.org/W2949562459","https://openalex.org/W2962684017","https://openalex.org/W2962725887","https://openalex.org/W2962747323","https://openalex.org/W2963433233","https://openalex.org/W2964054038","https://openalex.org/W3162118826","https://openalex.org/W4234552385","https://openalex.org/W4242580520","https://openalex.org/W4249033934"],"related_works":["https://openalex.org/W2011810134","https://openalex.org/W2050497389","https://openalex.org/W1568976062","https://openalex.org/W2960025371","https://openalex.org/W4285804621","https://openalex.org/W2842402481","https://openalex.org/W2098270445","https://openalex.org/W3017351265","https://openalex.org/W4281719965","https://openalex.org/W1511549656"],"abstract_inverted_index":{"Efficient":[0],"GPU":[1,24,27,90],"resource":[2,8],"scheduling":[3,94,114],"is":[4],"essential":[5],"to":[6,34,51,88],"maximize":[7,89],"utilization":[9,91],"and":[10,92],"save":[11],"training":[12,83],"costs":[13],"for":[14,67,101],"the":[15,36,52,58,75,82,98,113,118,136],"increasing":[16],"amount":[17],"of":[18,39,54,77,120],"deep":[19,40,138],"learning":[20,41,139],"workloads":[21],"in":[22],"shared":[23],"clusters.":[25],"Existing":[26],"schedulers":[28,140],"largely":[29],"rely":[30],"on":[31,81,125],"static":[32],"policies":[33],"leverage":[35],"performance":[37],"characteristics":[38],"jobs.":[42],"However,":[43],"they":[44],"can":[45,110,134],"hardly":[46],"reach":[47],"optimal":[48],"efficiency":[49],"due":[50],"lack":[53],"elasticity.":[55],"To":[56],"address":[57],"problem,":[59],"we":[60],"propose":[61],"ONES,":[62],"an":[63,105],"ONline":[64],"Evolutionary":[65],"Scheduler":[66],"elastic":[68],"batch":[69,84,99],"size":[70,100],"orchestration.":[71],"ONES":[72,121,133],"automatically":[73],"manages":[74],"elasticity":[76],"each":[78,102],"job":[79,103,146],"based":[80],"size,":[85],"so":[86],"as":[87],"improve":[93],"efficiency.":[95],"It":[96],"determines":[97],"through":[104],"online":[106],"evolutionary":[107],"search":[108],"that":[109,132],"continuously":[111],"optimize":[112],"decisions.":[115],"We":[116],"evaluate":[117],"effectiveness":[119],"with":[122,141],"64":[123],"GPUs":[124],"TACC's":[126],"Longhorn":[127],"supercomputers.":[128],"The":[129],"results":[130],"show":[131],"outperform":[135],"prior":[137],"a":[142],"significantly":[143],"shorter":[144],"average":[145],"completion":[147],"time.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":4}],"updated_date":"2026-03-09T08:58:05.943551","created_date":"2025-10-10T00:00:00"}
