{"id":"https://openalex.org/W4225854101","doi":"https://doi.org/10.1109/tnsm.2021.3139607","title":"Large-Scale Machine Learning Cluster Scheduling via Multi-Agent Graph Reinforcement Learning","display_name":"Large-Scale Machine Learning Cluster Scheduling via Multi-Agent Graph Reinforcement Learning","publication_year":2021,"publication_date":"2021-12-31","ids":{"openalex":"https://openalex.org/W4225854101","doi":"https://doi.org/10.1109/tnsm.2021.3139607"},"language":"en","primary_location":{"id":"doi:10.1109/tnsm.2021.3139607","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnsm.2021.3139607","pdf_url":null,"source":{"id":"https://openalex.org/S173527311","display_name":"IEEE Transactions on Network and Service Management","issn_l":"1932-4537","issn":["1932-4537","2373-7379"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Network and Service Management","raw_type":"journal-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Xiaoyang Zhao","orcid":"https://orcid.org/0000-0002-8260-2181"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Xiaoyang Zhao","raw_affiliation_strings":["Department of Computer Science, The University of Hong Kong, Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, The University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012597518","display_name":"Chuan Wu","orcid":"https://orcid.org/0000-0002-3144-4398"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chuan Wu","raw_affiliation_strings":["Department of Computer Science, The University of Hong Kong, Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, The University of Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35189619,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"19","issue":"4","first_page":"4962","last_page":"4974"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9926999807357788,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13553","display_name":"Age of Information Optimization","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7722189426422119},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7537335157394409},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.6810814142227173},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6141296029090881},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.5330658555030823},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4689333736896515},{"id":"https://openalex.org/keywords/job-scheduler","display_name":"Job scheduler","score":0.46833205223083496},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.4580487310886383},{"id":"https://openalex.org/keywords/network-topology","display_name":"Network topology","score":0.45461735129356384},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.4541640877723694},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4526996612548828},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.30085572600364685},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1953202486038208},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.11309665441513062}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7722189426422119},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7537335157394409},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6810814142227173},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6141296029090881},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.5330658555030823},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4689333736896515},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.46833205223083496},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.4580487310886383},{"id":"https://openalex.org/C199845137","wikidata":"https://www.wikidata.org/wiki/Q145490","display_name":"Network topology","level":2,"score":0.45461735129356384},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.4541640877723694},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4526996612548828},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.30085572600364685},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1953202486038208},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.11309665441513062},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tnsm.2021.3139607","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnsm.2021.3139607","pdf_url":null,"source":{"id":"https://openalex.org/S173527311","display_name":"IEEE Transactions on Network and Service Management","issn_l":"1932-4537","issn":["1932-4537","2373-7379"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Network and Service Management","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.699999988079071}],"awards":[{"id":"https://openalex.org/G5807435805","display_name":null,"funder_award_id":"17208920","funder_id":"https://openalex.org/F4320321592","funder_display_name":"Research Grants Council, University Grants Committee"},{"id":"https://openalex.org/G692013346","display_name":null,"funder_award_id":"HKU 17204619","funder_id":"https://openalex.org/F4320321592","funder_display_name":"Research Grants Council, University Grants Committee"},{"id":"https://openalex.org/G7753977827","display_name":null,"funder_award_id":"17207621","funder_id":"https://openalex.org/F4320321592","funder_display_name":"Research Grants Council, University Grants Committee"},{"id":"https://openalex.org/G8044317974","display_name":null,"funder_award_id":"C5026-18G (CRF)","funder_id":"https://openalex.org/F4320321592","funder_display_name":"Research Grants Council, University Grants Committee"}],"funders":[{"id":"https://openalex.org/F4320321592","display_name":"Research Grants Council, University Grants Committee","ror":"https://ror.org/00djwmt25"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1832693441","https://openalex.org/W2013265062","https://openalex.org/W2021667890","https://openalex.org/W2129542763","https://openalex.org/W2130531694","https://openalex.org/W2141992894","https://openalex.org/W2548012461","https://openalex.org/W2552554409","https://openalex.org/W2606202972","https://openalex.org/W2794260443","https://openalex.org/W2798515322","https://openalex.org/W2907492528","https://openalex.org/W2919897868","https://openalex.org/W2968986602","https://openalex.org/W2980810797","https://openalex.org/W2991046523","https://openalex.org/W2998281477","https://openalex.org/W3000436270","https://openalex.org/W3024450730","https://openalex.org/W3162118826","https://openalex.org/W3185663263","https://openalex.org/W6637373629","https://openalex.org/W6684084819","https://openalex.org/W6713134421","https://openalex.org/W6739901393","https://openalex.org/W6745778385","https://openalex.org/W6756009870","https://openalex.org/W6763760226","https://openalex.org/W6769424276","https://openalex.org/W6784871562"],"related_works":["https://openalex.org/W928209","https://openalex.org/W15790799","https://openalex.org/W18302568","https://openalex.org/W17635708","https://openalex.org/W12884433","https://openalex.org/W9269004","https://openalex.org/W7414709","https://openalex.org/W9163358","https://openalex.org/W1748150","https://openalex.org/W5464513"],"abstract_inverted_index":{"Efficient":[0],"scheduling":[1,105,191],"of":[2,63,117,146,150,171,176,199],"distributed":[3],"deep":[4],"learning":[5,103,208],"(DL)":[6],"jobs":[7,24,32,74],"in":[8,75,93,169,197],"large":[9,86],"GPU":[10,64],"clusters":[11,60],"is":[12,81,162,203],"crucial":[13],"for":[14,174],"resource":[15,26,36],"efficiency":[16],"and":[17,52,78,98,141,181,202],"job":[18,39,111,119,159],"performance.":[19],"While":[20],"server":[21,142],"sharing":[22],"among":[23,29],"improves":[25],"utilization,":[27],"interference":[28,50,160,167],"co-located":[30],"DL":[31],"occurs":[33],"due":[34,83],"to":[35,70,84,107,135,155,165,205],"contention.":[37],"Interference-aware":[38],"placement":[40,112],"has":[41],"been":[42],"studied,":[43],"with":[44,55],"white-box":[45],"approaches":[46],"based":[47],"on":[48],"explicit":[49],"modeling":[51],"black-box":[53],"schedulers":[54,92],"reinforcement":[56,102],"learning.":[57],"In":[58,144],"today\u2019s":[59],"containing":[61],"thousands":[62],"servers,":[65],"running":[66],"a":[67,76,94,100,147,158],"single":[68],"scheduler":[69,187],"manage":[71],"all":[72],"arrival":[73],"timely":[77],"effective":[79],"manner":[80],"challenging,":[82],"the":[85,115,137,177],"workload":[87],"scale.":[88],"We":[89],"adopt":[90],"multiple":[91],"large-scale":[95],"cluster/data":[96],"center,":[97],"propose":[99],"multi-agent":[101],"(MARL)":[104],"framework":[106,129,188],"cooperatively":[108],"learn":[109],"fine-grained":[110],"policies,":[113],"towards":[114],"objective":[116],"minimizing":[118],"completion":[120],"time":[121],"(JCT).":[122],"To":[123],"achieve":[124],"topology-aware":[125],"placements,":[126,157],"our":[127,186],"proposed":[128],"uses":[130],"hierarchical":[131],"graph":[132],"neural":[133],"networks":[134],"encode":[136],"data":[138],"center":[139],"topology":[140],"architecture.":[143],"view":[145],"common":[148],"lack":[149],"precise":[151],"reward":[152],"samples":[153],"corresponding":[154],"different":[156],"model":[161],"further":[163],"devised":[164],"predict":[166],"levels":[168],"face":[170],"various":[172,206],"co-locations,":[173],"training":[175],"MARL":[178],"schedulers.":[179],"Testbed":[180],"trace-driven":[182],"evaluations":[183],"show":[184],"that":[185],"outperforms":[189],"representative":[190],"schemes":[192],"by":[193],"more":[194],"than":[195],"20%":[196],"terms":[198],"average":[200],"JCT,":[201],"adaptive":[204],"machine":[207],"cluster":[209],"topologies.":[210]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2022-05-05T00:00:00"}
