{"id":"https://openalex.org/W7123703360","doi":"https://doi.org/10.1145/3772052.3772257","title":"Hybrid Learning and Optimization-Based Dynamic Scheduling for DL Workloads on Heterogeneous GPU Clusters","display_name":"Hybrid Learning and Optimization-Based Dynamic Scheduling for DL Workloads on Heterogeneous GPU Clusters","publication_year":2025,"publication_date":"2025-11-19","ids":{"openalex":"https://openalex.org/W7123703360","doi":"https://doi.org/10.1145/3772052.3772257"},"language":"en","primary_location":{"id":"doi:10.1145/3772052.3772257","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772257","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3772052.3772257","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044629756","display_name":"Shruti Dongare","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shruti Dongare","raw_affiliation_strings":["Virginia Tech, Blacksburg, Virginia, USA"],"raw_orcid":"https://orcid.org/0009-0007-6148-9953","affiliations":[{"raw_affiliation_string":"Virginia Tech, Blacksburg, Virginia, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078335469","display_name":"Redwan Ibne Seraj Khan","orcid":"https://orcid.org/0000-0003-3228-6384"},"institutions":[{"id":"https://openalex.org/I4210137942","display_name":"D-Tech (United States)","ror":"https://ror.org/03tw6b878","country_code":"US","type":"company","lineage":["https://openalex.org/I4210137942"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Redwan Ibne Seraj Khan","raw_affiliation_strings":["Virginia Tech, Seattle, USA"],"raw_orcid":"https://orcid.org/0000-0003-3228-6384","affiliations":[{"raw_affiliation_string":"Virginia Tech, Seattle, USA","institution_ids":["https://openalex.org/I4210137942"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083817094","display_name":"Hadeel Albahar","orcid":"https://orcid.org/0000-0002-4732-2707"},"institutions":[{"id":"https://openalex.org/I36721946","display_name":"Kuwait University","ror":"https://ror.org/021e5j056","country_code":"KW","type":"education","lineage":["https://openalex.org/I36721946"]}],"countries":["KW"],"is_corresponding":false,"raw_author_name":"Hadeel Albahar","raw_affiliation_strings":["Kuwait University, Kuwait City, Kuwait"],"raw_orcid":"https://orcid.org/0000-0002-4732-2707","affiliations":[{"raw_affiliation_string":"Kuwait University, Kuwait City, Kuwait","institution_ids":["https://openalex.org/I36721946"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122933334","display_name":"Nannan Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Nannan Zhao","raw_affiliation_strings":["Northwestern Polytechnical University, China, Xi'an, China"],"raw_orcid":"https://orcid.org/0000-0001-6059-1154","affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, China, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122914814","display_name":"Diego Mel\u00e9ndez-Maita","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Diego Mel\u00e9ndez-Maita","raw_affiliation_strings":["Virginia Tech, Blacksburg, USA"],"raw_orcid":"https://orcid.org/0009-0003-7385-9879","affiliations":[{"raw_affiliation_string":"Virginia Tech, Blacksburg, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045271329","display_name":"A.E. Butt","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ali R. Butt","raw_affiliation_strings":["Virginia Tech, Blacksburg, Virginia, USA"],"raw_orcid":"https://orcid.org/0000-0002-0871-7263","affiliations":[{"raw_affiliation_string":"Virginia Tech, Blacksburg, Virginia, USA","institution_ids":["https://openalex.org/I859038795"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75365597,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"557","last_page":"570"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.5738000273704529,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.5738000273704529,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.08259999752044678,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.07660000026226044,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.7096999883651733},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.7067999839782715},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.6029999852180481},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.5705000162124634},{"id":"https://openalex.org/keywords/prioritization","display_name":"Prioritization","score":0.5293999910354614},{"id":"https://openalex.org/keywords/queueing-theory","display_name":"Queueing theory","score":0.5282999873161316},{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.3675000071525574},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.33880001306533813},{"id":"https://openalex.org/keywords/computation-offloading","display_name":"Computation offloading","score":0.3276999890804291}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8565999865531921},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.7096999883651733},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.7067999839782715},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.650600016117096},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.6029999852180481},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.5705000162124634},{"id":"https://openalex.org/C2777615720","wikidata":"https://www.wikidata.org/wiki/Q11888847","display_name":"Prioritization","level":2,"score":0.5293999910354614},{"id":"https://openalex.org/C22684755","wikidata":"https://www.wikidata.org/wiki/Q847526","display_name":"Queueing theory","level":2,"score":0.5282999873161316},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.3675000071525574},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.33880001306533813},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3276999890804291},{"id":"https://openalex.org/C2781041963","wikidata":"https://www.wikidata.org/wiki/Q18348618","display_name":"Computation offloading","level":4,"score":0.3276999890804291},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.30000001192092896},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.2928999960422516},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.28790000081062317},{"id":"https://openalex.org/C2992525071","wikidata":"https://www.wikidata.org/wiki/Q50818671","display_name":"Federated learning","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2840000092983246},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C123403432","wikidata":"https://www.wikidata.org/wiki/Q654068","display_name":"Visibility","level":2,"score":0.27410000562667847},{"id":"https://openalex.org/C126831891","wikidata":"https://www.wikidata.org/wiki/Q221673","display_name":"Host (biology)","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C158207573","wikidata":"https://www.wikidata.org/wiki/Q5747224","display_name":"Heterogeneous network","level":4,"score":0.2542000114917755},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2540000081062317},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.2524000108242035},{"id":"https://openalex.org/C76518257","wikidata":"https://www.wikidata.org/wiki/Q271680","display_name":"Software framework","level":5,"score":0.25130000710487366}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3772052.3772257","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772257","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},{"id":"pmh:oai:vtechworks.lib.vt.edu:10919/141117","is_oa":false,"landing_page_url":"https://hdl.handle.net/10919/141117","pdf_url":null,"source":{"id":"https://openalex.org/S4306400248","display_name":"VTechWorks (Virginia Tech)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I859038795","host_organization_name":"Virginia Tech","host_organization_lineage":["https://openalex.org/I859038795"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":null,"raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1145/3772052.3772257","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772257","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2890016880","display_name":null,"funder_award_id":"CSR-2106634, CSR-2312785","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4057393648","display_name":null,"funder_award_id":"Grant No. 62202382","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W2147598193","https://openalex.org/W2749122933","https://openalex.org/W2767921504","https://openalex.org/W2798515322","https://openalex.org/W2900465450","https://openalex.org/W3022298203","https://openalex.org/W3129362935","https://openalex.org/W3133465160","https://openalex.org/W3197816522","https://openalex.org/W4200184446","https://openalex.org/W4372261197","https://openalex.org/W4372262787","https://openalex.org/W4380047697","https://openalex.org/W4387321109","https://openalex.org/W4388662057"],"related_works":[],"abstract_inverted_index":{"Modern":[0],"cloud":[1,137],"platforms":[2],"increasingly":[3],"host":[4],"large-scale":[5,89],"deep":[6],"learning":[7,49],"(DL)":[8],"workloads,":[9],"demanding":[10],"high-throughput,":[11],"low-latency":[12],"GPU":[13,20,62,100],"scheduling.":[14],"However,":[15],"the":[16],"growing":[17],"heterogeneity":[18],"of":[19],"clusters":[21],"and":[22,56,84,96,113,147],"limited":[23],"visibility":[24],"into":[25],"application":[26],"characteristics":[27],"pose":[28],"major":[29],"challenges":[30],"for":[31,136,143],"existing":[32],"schedulers,":[33],"which":[34],"often":[35],"rely":[36],"on":[37,60,88],"offline":[38],"profiling":[39],"or":[40],"application-specific":[41],"assumptions.":[42],"We":[43],"present":[44],"RLTune,":[45],"an":[46],"application-agnostic":[47],"reinforcement":[48],"(RL)-based":[50],"scheduling":[51],"framework":[52],"that":[53],"dynamically":[54],"prioritizes":[55],"allocates":[57],"DL":[58,149],"jobs":[59],"heterogeneous":[61],"clusters.":[63],"RLTune":[64,98,124],"integrates":[65],"RL-driven":[66],"prioritization":[67],"with":[68],"MILP-based":[69],"job-to-node":[70],"mapping":[71],"to":[72,104,111,139],"optimize":[73],"system-wide":[74],"objectives":[75],"such":[76],"as":[77,117,119],"job":[78],"completion":[79],"time":[80],"(JCT),":[81],"queueing":[82,107],"delay,":[83],"resource":[85],"utilization.":[86],"Trained":[87],"production":[90],"traces":[91],"from":[92],"Microsoft":[93],"Philly,":[94],"Helios,":[95],"Alibaba,":[97],"improves":[99],"utilization":[101],"by":[102,109,116],"up":[103,110],"20%,":[105],"reduces":[106],"delay":[108],"81%,":[112],"shortens":[114],"JCT":[115],"much":[118],"70%.":[120],"Unlike":[121],"prior":[122],"approaches,":[123],"generalizes":[125],"across":[126],"diverse":[127],"workloads":[128],"without":[129],"requiring":[130],"per-job":[131],"profiling,":[132],"making":[133],"it":[134],"practical":[135],"providers":[138],"deploy":[140],"at":[141],"scale":[142],"more":[144],"efficient,":[145],"fair,":[146],"sustainable":[148],"workload":[150],"management.":[151]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-14T00:00:00"}
