{"id":"https://openalex.org/W4406302347","doi":"https://doi.org/10.1145/3711927","title":"Taming Flexible Job Packing in Deep Learning Training Clusters","display_name":"Taming Flexible Job Packing in Deep Learning Training Clusters","publication_year":2025,"publication_date":"2025-01-13","ids":{"openalex":"https://openalex.org/W4406302347","doi":"https://doi.org/10.1145/3711927"},"language":"en","primary_location":{"id":"doi:10.1145/3711927","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3711927","pdf_url":null,"source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1145/3711927","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Pengyu Yang","orcid":"https://orcid.org/0009-0004-6225-2139"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Pengyu Yang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-6225-2139","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008837660","display_name":"Weihao Cui","orcid":"https://orcid.org/0000-0002-6646-5260"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]},{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN","SG"],"is_corresponding":false,"raw_author_name":"Weihao Cui","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China and National University of Singapore, Singapore, Singapore","Shanghai Jiao Tong University, Shanghai China","National University of Singapore, Singapore Singapore"],"raw_orcid":"https://orcid.org/0000-0002-6646-5260","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China and National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596","https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"National University of Singapore, Singapore Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111875369","display_name":"Chunyu Xue","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chunyu Xue","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China","Shanghai Jiao Tong University, Shanghai China"],"raw_orcid":"https://orcid.org/0009-0008-9272-1732","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063596525","display_name":"Han Zhao","orcid":"https://orcid.org/0000-0002-1561-5329"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Han Zhao","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China","Shanghai Jiao Tong University, Shanghai China"],"raw_orcid":"https://orcid.org/0000-0002-1561-5329","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100418393","display_name":"Chen Chen","orcid":"https://orcid.org/0000-0001-9480-5632"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen Chen","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China","Shanghai Jiao Tong University, Shanghai China"],"raw_orcid":"https://orcid.org/0000-0001-9480-5632","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377840","display_name":"Quan Chen","orcid":"https://orcid.org/0000-0001-5832-0347"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Quan Chen","raw_affiliation_strings":["Department of Computer Science, Shanghai Jiao Tong University, Shanghai, China","Department of Computer Science, Shanghai Jiao Tong University, Shanghai China"],"raw_orcid":"https://orcid.org/0000-0001-5832-0347","affiliations":[{"raw_affiliation_string":"Department of Computer Science, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Department of Computer Science, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jing Yang","orcid":"https://orcid.org/0009-0007-8456-2420"},"institutions":[{"id":"https://openalex.org/I178232147","display_name":"Guizhou University","ror":"https://ror.org/02wmsc916","country_code":"CN","type":"education","lineage":["https://openalex.org/I178232147"]},{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Yang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China and State Key Laboratory of Public Big Data, Guizhou University, Guiyang, China","Shanghai Jiao Tong University, Shanghai China","State Key Laboratory of Public Big Data, Guizhou University, Guiyang, China"],"raw_orcid":"https://orcid.org/0009-0007-8456-2420","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China and State Key Laboratory of Public Big Data, Guizhou University, Guiyang, China","institution_ids":["https://openalex.org/I178232147","https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"State Key Laboratory of Public Big Data, Guizhou University, Guiyang, China","institution_ids":["https://openalex.org/I178232147"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039318240","display_name":"Minyi Guo","orcid":"https://orcid.org/0000-0003-0034-2302"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minyi Guo","raw_affiliation_strings":["Computer Science, Shanghai Jiao Tong University, Shanghai, China","Computer Science, Shanghai Jiao Tong University, Shanghai China"],"raw_orcid":"https://orcid.org/0000-0003-0034-2302","affiliations":[{"raw_affiliation_string":"Computer Science, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Computer Science, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":3.8114,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.92145502,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":"22","issue":"1","first_page":"1","last_page":"24"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12288","display_name":"Optimization and Search Problems","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7497979402542114},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6984748840332031},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.49560654163360596},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4353453814983368},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.35640838742256165}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7497979402542114},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6984748840332031},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49560654163360596},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4353453814983368},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35640838742256165},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3711927","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3711927","pdf_url":null,"source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3711927","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3711927","pdf_url":null,"source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","score":0.6700000166893005,"display_name":"Decent work and economic growth"}],"awards":[{"id":"https://openalex.org/G2857700228","display_name":null,"funder_award_id":"62232011","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6154009287","display_name":null,"funder_award_id":"2023YFB3001504","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G6799037833","display_name":null,"funder_award_id":"62302302, 62232011","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7352368754","display_name":null,"funder_award_id":"62302302","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7516789681","display_name":null,"funder_award_id":"2023YFB3001504","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320309612","display_name":"Natural Science Foundation of Shanghai","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W1494198834","https://openalex.org/W2108598243","https://openalex.org/W2156077332","https://openalex.org/W2194775991","https://openalex.org/W2910100551","https://openalex.org/W2914209329","https://openalex.org/W2926143647","https://openalex.org/W2963163009","https://openalex.org/W2963323070","https://openalex.org/W2963748441","https://openalex.org/W2982083293","https://openalex.org/W3017091196","https://openalex.org/W3022298203","https://openalex.org/W3157306683","https://openalex.org/W3206418100","https://openalex.org/W3208777667","https://openalex.org/W4231332361","https://openalex.org/W4288079579","https://openalex.org/W4313160155","https://openalex.org/W4318541537","https://openalex.org/W4318541676","https://openalex.org/W4372261197","https://openalex.org/W4387321109","https://openalex.org/W4388102940","https://openalex.org/W4394998532"],"related_works":["https://openalex.org/W2731899572","https://openalex.org/W2961085424","https://openalex.org/W3215138031","https://openalex.org/W4306674287","https://openalex.org/W3009238340","https://openalex.org/W4360585206","https://openalex.org/W4321369474","https://openalex.org/W4285208911","https://openalex.org/W4387369504","https://openalex.org/W3046775127"],"abstract_inverted_index":{"Job":[0],"packing":[1,40,56,67,208],"is":[2],"an":[3],"effective":[4],"technique":[5],"to":[6,12,43,47,135,168,201],"harvest":[7],"the":[8,13,48,63,69,94,98,110,147,150,159,205],"idle":[9],"resources":[10],"allocated":[11],"deep":[14],"learning":[15],"(DL)":[16],"training":[17,70,189],"jobs":[18,112],"but":[19,113],"not":[20,105],"fully":[21],"utilized,":[22],"especially":[23],"when":[24],"clusters":[25],"may":[26,32],"experience":[27],"low":[28],"utilization,":[29],"and":[30,52,57,75,97,153,161,180],"users":[31],"overestimate":[33],"their":[34],"resource":[35,154],"needs.":[36],"However,":[37],"existing":[38],"job":[39,55,66,152,207],"techniques":[41],"tend":[42],"be":[44],"conservative":[45],"due":[46],"mismatch":[49],"in":[50,68,140],"scope":[51],"granularity":[53],"between":[54,93],"cluster":[58,71,95,179],"scheduling.":[59],"In":[60],"particular,":[61],"tapping":[62],"potential":[64],"of":[65,119,130,149,186],"requires":[72],"a":[73,84,128,141,176,184],"local":[74],"fine-grained":[76,142],"coordination":[77,108,165],"mechanism.":[78],"To":[79],"this":[80],"end,":[81],"we":[82],"propose":[83],"novel":[85],"job-packing":[86],"middleware":[87],"named":[88],"Gimbal":[89,103,122,174,195],",":[90],"which":[91],"operates":[92],"scheduler":[96],"hardware":[99],"resources.":[100],"As":[101],"middleware,":[102],"must":[104],"only":[106],"facilitate":[107],"among":[109],"packed":[111],"also":[114],"support":[115],"various":[116,169],"scheduling":[117,170,198],"objectives":[118,199],"different":[120,197],"schedulers.":[121],"achieves":[123],"dual":[124],"functionality":[125],"by":[126],"introducing":[127],"set":[129,185],"worker":[131],"calibration":[132],"primitives":[133,145],"designed":[134],"calibrate":[136],"workers\u2019":[137],"execution":[138],"status":[139],"manner.":[143],"The":[144,191],"obscure":[146],"complexity":[148],"underlying":[151],"management":[155],"mechanisms,":[156],"thus":[157],"offering":[158],"generality":[160],"extensibility":[162],"for":[163],"crafting":[164],"policies":[166],"tailored":[167],"objectives.":[171],"We":[172],"implement":[173],"on":[175],"real-world":[177],"GPU":[178],"evaluate":[181],"it":[182],"with":[183,204],"representative":[187],"DL":[188],"jobs.":[190],"results":[192],"show":[193],"that":[194],"improves":[196],"up":[200],"1.32\u00d7":[202],"compared":[203],"state-of-the-art":[206],"techniques.":[209]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
