{"id":"https://openalex.org/W4405778714","doi":"https://doi.org/10.1109/tpds.2024.3522333","title":"Integrated and Fungible Scheduling of Deep Learning Workloads Using Multi-Agent Reinforcement Learning","display_name":"Integrated and Fungible Scheduling of Deep Learning Workloads Using Multi-Agent Reinforcement Learning","publication_year":2024,"publication_date":"2024-12-25","ids":{"openalex":"https://openalex.org/W4405778714","doi":"https://doi.org/10.1109/tpds.2024.3522333"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2024.3522333","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3522333","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100679362","display_name":"Jialun Li","orcid":"https://orcid.org/0000-0003-0941-9820"},"institutions":[{"id":"https://openalex.org/I4210122543","display_name":"Guangdong Polytechnic Normal University","ror":"https://ror.org/02pcb5m77","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210122543"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jialun Li","raw_affiliation_strings":["School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-0941-9820","affiliations":[{"raw_affiliation_string":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China","institution_ids":["https://openalex.org/I4210122543"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041031990","display_name":"Danyang Xiao","orcid":"https://orcid.org/0000-0001-6798-9683"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Danyang Xiao","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6798-9683","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053396759","display_name":"Diying Yang","orcid":"https://orcid.org/0009-0006-8640-3530"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Diying Yang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0006-8640-3530","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011076660","display_name":"Xuan Mo","orcid":"https://orcid.org/0000-0001-6858-9307"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuan Mo","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6858-9307","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084826798","display_name":"Weigang Wu","orcid":"https://orcid.org/0000-0002-4714-7021"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weigang Wu","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-4714-7021","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.8964,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.7913691,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"36","issue":"3","first_page":"391","last_page":"406"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10551","display_name":"Scheduling and Optimization Algorithms","score":0.9587000012397766,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10551","display_name":"Scheduling and Optimization Algorithms","score":0.9587000012397766,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8647130727767944},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.8117361068725586},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.629136860370636},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.47156283259391785},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3377436697483063},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.3276441693305969}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8647130727767944},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.8117361068725586},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.629136860370636},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.47156283259391785},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3377436697483063},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.3276441693305969},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2024.3522333","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3522333","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8226029110","display_name":null,"funder_award_id":"62372487","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":59,"referenced_works":["https://openalex.org/W1948566616","https://openalex.org/W2105947650","https://openalex.org/W2133569115","https://openalex.org/W2194775991","https://openalex.org/W2529337537","https://openalex.org/W2593512320","https://openalex.org/W2798515322","https://openalex.org/W2892341857","https://openalex.org/W2904319713","https://openalex.org/W2919897868","https://openalex.org/W2963125010","https://openalex.org/W2963163009","https://openalex.org/W2964110616","https://openalex.org/W3022298203","https://openalex.org/W3022548332","https://openalex.org/W3121689374","https://openalex.org/W3130571442","https://openalex.org/W3134991928","https://openalex.org/W3138303811","https://openalex.org/W3156295478","https://openalex.org/W3157306683","https://openalex.org/W3191069914","https://openalex.org/W4206790491","https://openalex.org/W4285524851","https://openalex.org/W4297964528","https://openalex.org/W4313855892","https://openalex.org/W4372262787","https://openalex.org/W4376851285","https://openalex.org/W4378979503","https://openalex.org/W4383899813","https://openalex.org/W4385245566","https://openalex.org/W4386249662","https://openalex.org/W4386608891","https://openalex.org/W4386902902","https://openalex.org/W4387008005","https://openalex.org/W4387042170","https://openalex.org/W6636079778","https://openalex.org/W6639249596","https://openalex.org/W6640090968","https://openalex.org/W6683195989","https://openalex.org/W6683204974","https://openalex.org/W6684191040","https://openalex.org/W6717018068","https://openalex.org/W6726873649","https://openalex.org/W6727690538","https://openalex.org/W6734517396","https://openalex.org/W6747473740","https://openalex.org/W6755069753","https://openalex.org/W6756009870","https://openalex.org/W6758283263","https://openalex.org/W6759814162","https://openalex.org/W6769424276","https://openalex.org/W6779416026","https://openalex.org/W6780559895","https://openalex.org/W6781728138","https://openalex.org/W6782839094","https://openalex.org/W6793903029","https://openalex.org/W6839007396","https://openalex.org/W7067822191"],"related_works":["https://openalex.org/W4306904969","https://openalex.org/W2138720691","https://openalex.org/W4362501864","https://openalex.org/W4380318855","https://openalex.org/W3084456289","https://openalex.org/W2024136090","https://openalex.org/W4391331176","https://openalex.org/W2031695474","https://openalex.org/W2586732548","https://openalex.org/W2964765435"],"abstract_inverted_index":{"GPU":[0,197],"clusters":[1],"have":[2],"been":[3],"widely":[4],"used":[5],"to":[6,66,132,147,173,209],"co-locate":[7],"various":[8],"deep":[9],"learning":[10,94],"(DL)":[11],"workloads":[12,32],"in":[13,123,211,217],"a":[14,74,90,194],"multi-tenant":[15],"way.":[16],"Although":[17],"such":[18,73],"resource":[19,26,60,70,103,109,113,137,150],"sharing":[20],"can":[21,202],"significantly":[22,76],"reduce":[23],"training":[24],"cost,":[25],"contention":[27],"and":[28,38,59,62,68,102,106,134,188,215],"interference":[29],"among":[30],"co-located":[31],"make":[33],"task":[34,57,100,163,176],"scheduling":[35,43,52,82,96,205],"very":[36],"complex":[37],"challenging.":[39],"To":[40],"simplify":[41],"the":[42,49,78],"problem,":[44],"existing":[45],"algorithms":[46,206],"usually":[47],"divide":[48],"procedure":[50],"of":[51,80,115,120,166,213,219],"into":[53],"two":[54,124,181],"sub-tasks,":[55],"i.e.,":[56],"placement":[58,101,164],"allocation,":[61],"allocate":[63],"resources":[64],"according":[65],"pre-defined":[67],"fixed":[69],"demands.":[71],"However,":[72],"paradigm":[75],"constrains":[77],"selection":[79],"potential":[81,175],"solutions.":[83,152],"In":[84],"this":[85],"article,":[86],"we":[87],"present":[88],"MAIFS,":[89],"novel":[91],"multi-agent":[92,127],"reinforcement":[93],"based":[95,111,192,196],"algorithm":[97],"that":[98,200],"handles":[99],"allocation":[104,110,151],"integratedly,":[105],"allows":[107],"fungible":[108,149],"on":[112,193],"sensitivity":[114],"DL":[116,185],"workloads.":[117],"The":[118,126,153],"core":[119],"MAIFS":[121,201],"lies":[122],"mechanisms.":[125],"attention":[128],"mechanism":[129,157],"is":[130,158],"designed":[131,159],"learn":[133],"share":[135],"inter-related":[136],"state":[138],"features":[139],"observed":[140],"from":[141],"different":[142],"agents,":[143],"which":[144],"enables":[145],"agents":[146,167],"explore":[148],"dynamic":[154],"coordination":[155],"graph":[156],"for":[160],"coordinating":[161],"interactive":[162],"decisions":[165],"during":[168],"integrated":[169],"scheduling,":[170],"so":[171],"as":[172],"mitigate":[174],"conflicts.":[177],"Simulated":[178],"experiments":[179,191],"using":[180],"large":[182],"scale":[183],"production":[184],"workload":[186],"traces":[187],"physical":[189],"deployment":[190],"Kubernetes":[195],"cluster":[198],"show":[199],"outperform":[203],"state-of-the-art":[204],"by":[207],"up":[208],"44%":[210],"terms":[212,218],"makespan":[214],"46%":[216],"job":[220],"completion":[221],"time":[222],"(JCT).":[223]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-24T13:16:06.693445","created_date":"2025-10-10T00:00:00"}
