{"id":"https://openalex.org/W4415626591","doi":"https://doi.org/10.1109/tpds.2025.3626153","title":"MCG-Sched: Multi-Cluster GPU Scheduling for Resource Fragmentation Reduction and Load Balancing","display_name":"MCG-Sched: Multi-Cluster GPU Scheduling for Resource Fragmentation Reduction and Load Balancing","publication_year":2025,"publication_date":"2025-10-28","ids":{"openalex":"https://openalex.org/W4415626591","doi":"https://doi.org/10.1109/tpds.2025.3626153"},"language":null,"primary_location":{"id":"doi:10.1109/tpds.2025.3626153","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3626153","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101008684","display_name":"Haijie Wu","orcid":"https://orcid.org/0009-0008-0626-6949"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haijie Wu","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xinhua Wang","orcid":"https://orcid.org/0000-0001-7346-0823"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinhua Wang","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-7346-0823","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103137239","display_name":"Xiaoxuan Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoxuan Luo","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029924203","display_name":"Wangbo Shen","orcid":"https://orcid.org/0000-0003-3761-843X"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wangbo Shen","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-3761-843X","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"last","author":{"id":null,"display_name":"Weiwei Lin","orcid":"https://orcid.org/0000-0001-6876-1795"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiwei Lin","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6876-1795","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101008684"],"corresponding_institution_ids":["https://openalex.org/I90610280"],"apc_list":null,"apc_paid":null,"fwci":2.9051,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.93506509,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"36","issue":"12","first_page":"2789","last_page":"2800"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.41040000319480896,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.41040000319480896,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.18449999392032623,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.16920000314712524,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.8241000175476074},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.8151999711990356},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6535000205039978},{"id":"https://openalex.org/keywords/data-center","display_name":"Data center","score":0.5038999915122986},{"id":"https://openalex.org/keywords/load-management","display_name":"Load management","score":0.5027999877929688},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.42669999599456787},{"id":"https://openalex.org/keywords/computer-cluster","display_name":"Computer cluster","score":0.4156999886035919},{"id":"https://openalex.org/keywords/virtual-machine","display_name":"Virtual machine","score":0.392300009727478},{"id":"https://openalex.org/keywords/idle","display_name":"Idle","score":0.3862999975681305}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8866999745368958},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.8241000175476074},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.8151999711990356},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6535000205039978},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.625},{"id":"https://openalex.org/C153740404","wikidata":"https://www.wikidata.org/wiki/Q671224","display_name":"Data center","level":2,"score":0.5038999915122986},{"id":"https://openalex.org/C2779370713","wikidata":"https://www.wikidata.org/wiki/Q357554","display_name":"Load management","level":2,"score":0.5027999877929688},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.42669999599456787},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.4156999886035919},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.39329999685287476},{"id":"https://openalex.org/C25344961","wikidata":"https://www.wikidata.org/wiki/Q192726","display_name":"Virtual machine","level":2,"score":0.392300009727478},{"id":"https://openalex.org/C16320812","wikidata":"https://www.wikidata.org/wiki/Q1812200","display_name":"Idle","level":2,"score":0.3862999975681305},{"id":"https://openalex.org/C2984822820","wikidata":"https://www.wikidata.org/wiki/Q1123036","display_name":"Processor scheduling","level":3,"score":0.3831999897956848},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.37310001254081726},{"id":"https://openalex.org/C191015642","wikidata":"https://www.wikidata.org/wiki/Q1132459","display_name":"Fragmentation (computing)","level":2,"score":0.3695000112056732},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C19012869","wikidata":"https://www.wikidata.org/wiki/Q578372","display_name":"Response time","level":2,"score":0.3465000092983246},{"id":"https://openalex.org/C3019591417","wikidata":"https://www.wikidata.org/wiki/Q575614","display_name":"Load distribution","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3208000063896179},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.29980000853538513},{"id":"https://openalex.org/C161597957","wikidata":"https://www.wikidata.org/wiki/Q16251507","display_name":"Network Load Balancing Services","level":3,"score":0.2840999960899353},{"id":"https://openalex.org/C31689143","wikidata":"https://www.wikidata.org/wiki/Q733809","display_name":"Fair-share scheduling","level":3,"score":0.2721000015735626},{"id":"https://openalex.org/C175893541","wikidata":"https://www.wikidata.org/wiki/Q1196582","display_name":"Round-robin scheduling","level":4,"score":0.2718000113964081},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.27160000801086426},{"id":"https://openalex.org/C51332947","wikidata":"https://www.wikidata.org/wiki/Q1172305","display_name":"Shared resource","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.26420000195503235},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2025.3626153","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3626153","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W3111074096","https://openalex.org/W4205916020","https://openalex.org/W4206790491","https://openalex.org/W4288064595","https://openalex.org/W4294982491","https://openalex.org/W4318541537","https://openalex.org/W4318541676","https://openalex.org/W4372262787","https://openalex.org/W4386436407","https://openalex.org/W4386799881","https://openalex.org/W4388145742","https://openalex.org/W4390650730","https://openalex.org/W4392699261","https://openalex.org/W4392973685","https://openalex.org/W4394745249","https://openalex.org/W4394944658","https://openalex.org/W4399419713","https://openalex.org/W4401176820","https://openalex.org/W4401537153","https://openalex.org/W4402284933","https://openalex.org/W4403728122","https://openalex.org/W4404386172","https://openalex.org/W4406302347","https://openalex.org/W4408049809","https://openalex.org/W4410294630"],"related_works":[],"abstract_inverted_index":{"Since":[0],"the":[1,23,26,136,140,164,170,174,179],"rapid":[2],"development":[3],"of":[4,16,142],"deep":[5],"learning":[6],"(DL)":[7],"technology,":[8],"large-scale":[9],"GPU":[10,47,59,126],"clusters":[11,81,91,96,127,162],"receive":[12],"a":[13,33,69,117,147],"large":[14],"number":[15],"DL":[17],"workloads":[18,27,180],"daily.":[19],"To":[20],"speed":[21],"up":[22,207],"completion":[24],"time,":[25],"usually":[28],"occupy":[29],"several":[30],"GPUs":[31,206],"on":[32,65],"server.":[34],"However,":[35],"workload":[36,143,153,171,211],"scheduling":[37,67,118,160],"inevitably":[38],"generates":[39],"resource":[40,55,60,66,101,122,191],"fragmentation,":[41,61],"which":[42,103],"results":[43],"in":[44,124,152,173],"many":[45],"scattered":[46],"resources":[48,93,138],"being":[49],"unavailable.":[50],"Existing":[51],"works":[52],"address":[53],"improving":[54,100],"utilization":[56,192],"by":[57,108,181,214],"reducing":[58],"while":[62,94,99,128],"they":[63],"focus":[64],"for":[68],"single":[70],"cluster":[71],"and":[72,82,145,177,185,193,210,223],"ignore":[73],"multiple":[74,125],"clusters.":[75,133],"Multi-cluster":[76],"scenarios,":[77],"such":[78],"as":[79],"virtual":[80],"geo-distributed":[83],"clusters,":[84],"require":[85],"load":[86,130,159,165,186,194,197,226],"balancing":[87,131,166,187,195],"to":[88,120,189,208,219],"avoid":[89],"some":[90,95],"exhausting":[92],"are":[97],"idle":[98],"utilization,":[102],"is":[104],"not":[105],"well":[106],"addressed":[107],"existing":[109,220],"works.":[110],"In":[111],"this":[112],"paper,":[113],"we":[114],"propose":[115],"MCG-Sched,":[116],"strategy":[119],"reduce":[121],"fragmentation":[123,151,183],"maintaining":[129],"among":[132],"MCG-Sched":[134,156,168,203],"measures":[135],"fragmented":[137],"with":[139],"distribution":[141],"demands":[144],"uses":[146],"scheme":[148],"that":[149,202],"minimizes":[150],"scheduling.":[154],"Meanwhile,":[155],"achieves":[157,224],"balanced":[158],"across":[161],"through":[163],"index.":[167],"senses":[169],"requests":[172],"waiting":[175,212],"queue,":[176],"prioritizes":[178],"combining":[182],"measurement":[184],"index":[188],"maximize":[190],"during":[196],"peak.":[198],"Our":[199],"experiments":[200],"show":[201],"reduces":[204],"unallocated":[205],"1.45\u00d7":[209],"time":[213],"more":[215],"than":[216],"40%":[217],"compared":[218],"fragmentation-aware":[221],"methods":[222],"effective":[225],"balancing.":[227]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-29T00:00:00"}
