{"id":"https://openalex.org/W7114921104","doi":"https://doi.org/10.1109/tc.2025.3642832","title":"Graph-Based Batch Job Load Balancing Scheduling for Multi-Dimensional Resources in Heterogeneous GPU Clusters","display_name":"Graph-Based Batch Job Load Balancing Scheduling for Multi-Dimensional Resources in Heterogeneous GPU Clusters","publication_year":2025,"publication_date":"2025-12-12","ids":{"openalex":"https://openalex.org/W7114921104","doi":"https://doi.org/10.1109/tc.2025.3642832"},"language":null,"primary_location":{"id":"doi:10.1109/tc.2025.3642832","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tc.2025.3642832","pdf_url":null,"source":{"id":"https://openalex.org/S157670870","display_name":"IEEE Transactions on Computers","issn_l":"0018-9340","issn":["0018-9340","1557-9956","2326-3814"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Sheng Wang","orcid":"https://orcid.org/0000-0002-9980-6922"},"institutions":[{"id":"https://openalex.org/I148128674","display_name":"University of Shanghai for Science and Technology","ror":"https://ror.org/00ay9v204","country_code":"CN","type":"education","lineage":["https://openalex.org/I148128674"]},{"id":"https://openalex.org/I4210115456","display_name":"Chuzhou University","ror":"https://ror.org/037663q52","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115456"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Sheng Wang","raw_affiliation_strings":["Business School, University of Shanghai for Science and Technology, Shanghai, China","School of Computer and Information Engineering, Chuzhou University, Chuzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-9980-6922","affiliations":[{"raw_affiliation_string":"Business School, University of Shanghai for Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I148128674"]},{"raw_affiliation_string":"School of Computer and Information Engineering, Chuzhou University, Chuzhou, China","institution_ids":["https://openalex.org/I4210115456"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yumei Shi","orcid":"https://orcid.org/0009-0006-3312-348X"},"institutions":[{"id":"https://openalex.org/I4210115456","display_name":"Chuzhou University","ror":"https://ror.org/037663q52","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115456"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yumei Shi","raw_affiliation_strings":["School of Mathematics and Finance, Chuzhou University, Chuzhou, China"],"raw_orcid":"https://orcid.org/0009-0006-3312-348X","affiliations":[{"raw_affiliation_string":"School of Mathematics and Finance, Chuzhou University, Chuzhou, China","institution_ids":["https://openalex.org/I4210115456"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Shiping Chen","orcid":"https://orcid.org/0009-0002-7585-0715"},"institutions":[{"id":"https://openalex.org/I148128674","display_name":"University of Shanghai for Science and Technology","ror":"https://ror.org/00ay9v204","country_code":"CN","type":"education","lineage":["https://openalex.org/I148128674"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiping Chen","raw_affiliation_strings":["School of Optical-Electrical and Computer Engineering, University of Shanghai for Science and Technology, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0002-7585-0715","affiliations":[{"raw_affiliation_string":"School of Optical-Electrical and Computer Engineering, University of Shanghai for Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I148128674"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Guangshun Yao","orcid":"https://orcid.org/0000-0002-8273-1573"},"institutions":[{"id":"https://openalex.org/I4210115456","display_name":"Chuzhou University","ror":"https://ror.org/037663q52","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115456"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangshun Yao","raw_affiliation_strings":["School of Computer and Information Engineering, Chuzhou University, Chuzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-8273-1573","affiliations":[{"raw_affiliation_string":"School of Computer and Information Engineering, Chuzhou University, Chuzhou, China","institution_ids":["https://openalex.org/I4210115456"]}]},{"author_position":"last","author":{"id":null,"display_name":"Shengxiang Wang","orcid":"https://orcid.org/0000-0003-3241-0349"},"institutions":[{"id":"https://openalex.org/I148128674","display_name":"University of Shanghai for Science and Technology","ror":"https://ror.org/00ay9v204","country_code":"CN","type":"education","lineage":["https://openalex.org/I148128674"]},{"id":"https://openalex.org/I4210115456","display_name":"Chuzhou University","ror":"https://ror.org/037663q52","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210115456"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengxiang Wang","raw_affiliation_strings":["Business School, University of Shanghai for Science and Technology, Shanghai, China","School of Computer and Information Engineering, Chuzhou University, Chuzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-3241-0349","affiliations":[{"raw_affiliation_string":"Business School, University of Shanghai for Science and Technology, Shanghai, China","institution_ids":["https://openalex.org/I148128674"]},{"raw_affiliation_string":"School of Computer and Information Engineering, Chuzhou University, Chuzhou, China","institution_ids":["https://openalex.org/I4210115456"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I148128674","https://openalex.org/I4210115456"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.71190361,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"75","issue":"3","first_page":"860","last_page":"873"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.4643999934196472,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.4643999934196472,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.43479999899864197,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.026399999856948853,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/job-scheduler","display_name":"Job scheduler","score":0.6919000148773193},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.617900013923645},{"id":"https://openalex.org/keywords/batch-processing","display_name":"Batch processing","score":0.5360999703407288},{"id":"https://openalex.org/keywords/job-shop-scheduling","display_name":"Job shop scheduling","score":0.5314000248908997},{"id":"https://openalex.org/keywords/virtual-machine","display_name":"Virtual machine","score":0.47850000858306885},{"id":"https://openalex.org/keywords/dynamic-priority-scheduling","display_name":"Dynamic priority scheduling","score":0.4426000118255615},{"id":"https://openalex.org/keywords/two-level-scheduling","display_name":"Two-level scheduling","score":0.38909998536109924},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.38040000200271606},{"id":"https://openalex.org/keywords/fair-share-scheduling","display_name":"Fair-share scheduling","score":0.3628999888896942}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8551999926567078},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.7059000134468079},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.6919000148773193},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.617900013923645},{"id":"https://openalex.org/C172658912","wikidata":"https://www.wikidata.org/wiki/Q661613","display_name":"Batch processing","level":2,"score":0.5360999703407288},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.5314000248908997},{"id":"https://openalex.org/C25344961","wikidata":"https://www.wikidata.org/wiki/Q192726","display_name":"Virtual machine","level":2,"score":0.47850000858306885},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.4426000118255615},{"id":"https://openalex.org/C119948110","wikidata":"https://www.wikidata.org/wiki/Q7858726","display_name":"Two-level scheduling","level":4,"score":0.38909998536109924},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.38040000200271606},{"id":"https://openalex.org/C31689143","wikidata":"https://www.wikidata.org/wiki/Q733809","display_name":"Fair-share scheduling","level":3,"score":0.3628999888896942},{"id":"https://openalex.org/C185874996","wikidata":"https://www.wikidata.org/wiki/Q269699","display_name":"Interdependence","level":2,"score":0.34540000557899475},{"id":"https://openalex.org/C127456818","wikidata":"https://www.wikidata.org/wiki/Q238879","display_name":"Rate-monotonic scheduling","level":4,"score":0.3422999978065491},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3188999891281128},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.3100999891757965},{"id":"https://openalex.org/C2984822820","wikidata":"https://www.wikidata.org/wiki/Q1123036","display_name":"Processor scheduling","level":3,"score":0.3010999858379364},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C106189395","wikidata":"https://www.wikidata.org/wiki/Q176789","display_name":"Markov decision process","level":3,"score":0.2842000126838684},{"id":"https://openalex.org/C158336966","wikidata":"https://www.wikidata.org/wiki/Q3074426","display_name":"Flow shop scheduling","level":4,"score":0.2777000069618225},{"id":"https://openalex.org/C159886148","wikidata":"https://www.wikidata.org/wiki/Q176645","display_name":"Markov process","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2736000120639801},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C171627638","wikidata":"https://www.wikidata.org/wiki/Q6206744","display_name":"Job queue","level":4,"score":0.2646999955177307},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C51332947","wikidata":"https://www.wikidata.org/wiki/Q1172305","display_name":"Shared resource","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tc.2025.3642832","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tc.2025.3642832","pdf_url":null,"source":{"id":"https://openalex.org/S157670870","display_name":"IEEE Transactions on Computers","issn_l":"0018-9340","issn":["0018-9340","1557-9956","2326-3814"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computers","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.6895288825035095}],"awards":[{"id":"https://openalex.org/G3259732648","display_name":null,"funder_award_id":"2408085MF177","funder_id":"https://openalex.org/F4320334897","funder_display_name":"Natural Science Foundation of Anhui Province"}],"funders":[{"id":"https://openalex.org/F4320334897","display_name":"Natural Science Foundation of Anhui Province","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W2501976760","https://openalex.org/W2693539513","https://openalex.org/W2748100587","https://openalex.org/W2968986602","https://openalex.org/W2981604716","https://openalex.org/W2996463161","https://openalex.org/W3021780868","https://openalex.org/W3022298203","https://openalex.org/W3113308572","https://openalex.org/W3129362935","https://openalex.org/W3137919793","https://openalex.org/W3176501660","https://openalex.org/W3205605332","https://openalex.org/W4205601070","https://openalex.org/W4206490489","https://openalex.org/W4206790491","https://openalex.org/W4283367935","https://openalex.org/W4285218726","https://openalex.org/W4292356631","https://openalex.org/W4294000672","https://openalex.org/W4297375194","https://openalex.org/W4313189329","https://openalex.org/W4313535045","https://openalex.org/W4319990461","https://openalex.org/W4387587690","https://openalex.org/W4388153979","https://openalex.org/W4388819716","https://openalex.org/W4388855507","https://openalex.org/W4395047770","https://openalex.org/W4403425915","https://openalex.org/W4404385885","https://openalex.org/W4406138007"],"related_works":[],"abstract_inverted_index":{"GPU":[0],"clusters":[1],"serve":[2],"as":[3,132],"a":[4,11,73,110,123,133,154,160],"cornerstone":[5],"of":[6,14,25,97,183],"high-performance":[7],"computing":[8],"and":[9,28,64,83,95,103,122,180,203],"support":[10],"wide":[12],"range":[13],"batch":[15,26,91,115],"jobs":[16,27],"with":[17],"complex":[18],"resource":[19,29,53,62,117,120],"demands.":[20],"However,":[21],"the":[22,46,60,148,178,184,192],"diverse":[23,61],"requirements":[24],"heterogeneity":[30],"present":[31],"significant":[32],"challenges":[33],"to":[34,58,145,189],"efficient":[35],"scheduling.":[36],"Existing":[37],"approaches":[38],"either":[39],"rely":[40],"on":[41,78,138],"static":[42],"rules":[43],"or":[44],"overlook":[45],"interdependencies":[47,149],"among":[48,150],"virtual":[49,151],"machines":[50],"introduced":[51],"by":[52],"heterogeneity,":[54],"making":[55],"it":[56],"difficult":[57],"address":[59],"demands":[63],"dynamic":[65,161],"load":[66,93,198],"balancing.":[67],"In":[68],"this":[69,139],"paper,":[70],"we":[71],"propose":[72,109],"novel":[74],"scheduling":[75,96,128,157,186],"model":[76,112],"based":[77,137],"Graph":[79],"Neural":[80],"Networks":[81],"(GNNs)":[82],"Double":[84],"Deep":[85],"Q-Networks":[86],"(DDQNs),":[87],"termed":[88],"GNN-DDQN,":[89],"for":[90],"job":[92,116,200],"balancing":[94],"multi-dimensional":[98,119],"resources":[99],"(e.g.":[100],"GPU,":[101],"CPU":[102],"memory)":[104],"in":[105,197],"heterogeneous":[106],"clusters.":[107],"We":[108],"system":[111],"that":[113],"integrates":[114],"requests,":[118],"configurations,":[121],"multi-objective":[124],"optimization":[125],"framework.":[126],"The":[127],"problem":[129],"is":[130,143],"formulated":[131],"Markov":[134],"Decision":[135],"Process":[136],"model.":[140,187],"A":[141],"GNN":[142],"employed":[144],"effectively":[146],"capture":[147],"machines,":[152],"while":[153],"DDQN":[155],"optimizes":[156],"decisions":[158],"using":[159,170],"target":[162],"network":[163],"update":[164],"mechanism.":[165],"Extensive":[166],"experiments":[167],"are":[168],"conducted":[169],"two":[171],"real-world":[172],"Alibaba":[173],"cluster":[174],"traces.":[175],"Results":[176],"demonstrate":[177],"effectiveness":[179],"generalization":[181],"capabilities":[182],"proposed":[185],"Compared":[188],"baseline":[190],"methods,":[191],"results":[193],"confirm":[194],"its":[195],"superiority":[196],"balancing,":[199],"latency,":[201],"fairness,":[202],"time":[204],"efficiency.":[205]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-12-12T00:00:00"}
