{"id":"https://openalex.org/W4399282175","doi":"https://doi.org/10.1145/3650200.3656599","title":"Ymir: A Scheduler for Foundation Model Fine-tuning Workloads in Datacenters","display_name":"Ymir: A Scheduler for Foundation Model Fine-tuning Workloads in Datacenters","publication_year":2024,"publication_date":"2024-05-30","ids":{"openalex":"https://openalex.org/W4399282175","doi":"https://doi.org/10.1145/3650200.3656599"},"language":"en","primary_location":{"id":"doi:10.1145/3650200.3656599","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656599","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656599","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656599","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100732619","display_name":"Wei Gao","orcid":"https://orcid.org/0000-0002-7048-1722"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Wei Gao","raw_affiliation_strings":["S-Lab, Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-7048-1722","affiliations":[{"raw_affiliation_string":"S-Lab, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010841022","display_name":"Weiming Zhuang","orcid":"https://orcid.org/0000-0001-8243-7772"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Weiming Zhuang","raw_affiliation_strings":["Nanyang Technological University, Singapore and Sony AI, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-8243-7772","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore and Sony AI, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007449276","display_name":"Minghao Li","orcid":"https://orcid.org/0000-0002-5278-5255"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Minghao Li","raw_affiliation_strings":["Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-5278-5255","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100614087","display_name":"Peng Sun","orcid":"https://orcid.org/0000-0001-8456-0491"},"institutions":[{"id":"https://openalex.org/I4391012619","display_name":"Shanghai Artificial Intelligence Laboratory","ror":"https://ror.org/03wkvpx79","country_code":null,"type":"facility","lineage":["https://openalex.org/I4391012619"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Sun","raw_affiliation_strings":["SenseTime, China and Shanghai AI Lab, China"],"raw_orcid":"https://orcid.org/0000-0001-8456-0491","affiliations":[{"raw_affiliation_string":"SenseTime, China and Shanghai AI Lab, China","institution_ids":["https://openalex.org/I4391012619"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041572550","display_name":"Yonggang Wen","orcid":"https://orcid.org/0000-0002-2751-5114"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yonggang Wen","raw_affiliation_strings":["Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-2751-5114","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101591101","display_name":"Tianwei Zhang","orcid":"https://orcid.org/0000-0001-6595-6650"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Tianwei Zhang","raw_affiliation_strings":["Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-6595-6650","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100732619"],"corresponding_institution_ids":["https://openalex.org/I172675005"],"apc_list":null,"apc_paid":null,"fwci":0.6623,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.73136572,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"259","last_page":"271"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7839808464050293},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.5394637584686279},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.4769655466079712},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4086175560951233},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3640671968460083},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.33687537908554077},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08111140131950378}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7839808464050293},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.5394637584686279},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.4769655466079712},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4086175560951233},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3640671968460083},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.33687537908554077},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08111140131950378},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3650200.3656599","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656599","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656599","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3650200.3656599","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656599","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656599","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399282175.pdf"},"referenced_works_count":24,"referenced_works":["https://openalex.org/W12634471","https://openalex.org/W2466175319","https://openalex.org/W2535690855","https://openalex.org/W2798515322","https://openalex.org/W2901299405","https://openalex.org/W2962725887","https://openalex.org/W2964223283","https://openalex.org/W2981848390","https://openalex.org/W2982303846","https://openalex.org/W2990761674","https://openalex.org/W3022298203","https://openalex.org/W3035990676","https://openalex.org/W3036601975","https://openalex.org/W3081168214","https://openalex.org/W3135367836","https://openalex.org/W3173273620","https://openalex.org/W3204998121","https://openalex.org/W3206418100","https://openalex.org/W3213001067","https://openalex.org/W4220741164","https://openalex.org/W4225875565","https://openalex.org/W4308426163","https://openalex.org/W4372346804","https://openalex.org/W4375868763"],"related_works":["https://openalex.org/W2381393187","https://openalex.org/W2332779545","https://openalex.org/W2358060160","https://openalex.org/W2035483685","https://openalex.org/W1969764885","https://openalex.org/W596947562","https://openalex.org/W2793937822","https://openalex.org/W2790817834","https://openalex.org/W2220552745","https://openalex.org/W2777605427"],"abstract_inverted_index":{"The":[0],"breakthrough":[1],"of":[2,29,51,106,129],"foundation":[3,6],"models":[4],"makes":[5],"model":[7,21],"fine-tuning":[8,104],"(FMF)":[9],"workloads":[10,53,67,80,85,108],"prevalent":[11],"in":[12,34,54],"modern":[13],"GPU":[14,55],"datacenters.":[15,56],"However,":[16],"existing":[17,146],"schedulers":[18],"tailored":[19],"for":[20],"training":[22],"do":[23],"not":[24],"consider":[25],"the":[26,40,49,59,74,87,94,103,111,127,134,157],"unique":[27],"characteristics":[28],"FMs,":[30],"making":[31],"them":[32],"inefficient":[33],"handling":[35],"FMF":[36,52,66,79,84,107],"workloads.":[37],"To":[38],"bridge":[39],"gap,":[41],"we":[42],"propose":[43],"Ymir,":[44],"a":[45],"scheduler":[46],"to":[47,64,92,109,125,141],"improve":[48,93],"efficiency":[50,96],"Ymir":[57,72,101,131],"leverages":[58],"shared":[60],"FM":[61,89],"backbone":[62],"architecture":[63],"expedite":[65],"from":[68],"two":[69],"aspects:":[70],"(1)":[71],"investigates":[73],"task":[75,158],"transferability":[76],"among":[77],"different":[78],"and":[81,121],"automatically":[82],"merges":[83],"with":[86,145],"same":[88],"into":[90],"one":[91],"cluster-wide":[95],"via":[97],"transfer":[98],"learning.":[99],"(2)":[100],"reuses":[102],"runtime":[105],"reduce":[110,133],"significant":[112],"context":[113],"switch":[114],"overhead.":[115],"We":[116],"conduct":[117],"32-GPU":[118],"physical":[119],"experiments":[120],"240-GPU":[122],"trace-driven":[123],"simulations":[124],"validate":[126],"effectiveness":[128],"Ymir.":[130],"can":[132,163],"average":[135],"job":[136],"completion":[137],"time":[138],"by":[139,154],"up":[140],"4.3":[142],"\u00d7":[143],"compared":[144],"state-of-the-art":[147],"schedulers.":[148],"It":[149],"also":[150],"promotes":[151],"scheduling":[152],"fairness":[153],"fully":[155],"exploiting":[156],"transferability.":[159],"More":[160],"supplementary":[161],"materials":[162],"be":[164],"found":[165],"on":[166],"our":[167],"project":[168],"website":[169],"https://sites.google.com/view/ymir-project.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
