{"id":"https://openalex.org/W4386840193","doi":"https://doi.org/10.1145/3600006.3613152","title":"Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates","display_name":"Oobleck: Resilient Distributed Training of Large Models Using Pipeline Templates","publication_year":2023,"publication_date":"2023-10-03","ids":{"openalex":"https://openalex.org/W4386840193","doi":"https://doi.org/10.1145/3600006.3613152"},"language":"en","primary_location":{"id":"doi:10.1145/3600006.3613152","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3600006.3613152","pdf_url":"https://dl.acm.org/action/downloadSupplement?doi=10.1145%2F3600006.3613152&file=p382-jang-supp.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/action/downloadSupplement?doi=10.1145%2F3600006.3613152&file=p382-jang-supp.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064139539","display_name":"Insu Jang","orcid":"https://orcid.org/0009-0007-5206-2333"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Insu Jang","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI, USA"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI, USA","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100785449","display_name":"Zhenning Yang","orcid":"https://orcid.org/0009-0003-0813-5911"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhenning Yang","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI, USA"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI, USA","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029095203","display_name":"Zhen Zhang","orcid":"https://orcid.org/0000-0002-0164-0849"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Zhang","raw_affiliation_strings":["Amazon Web Services, Santa Clara, CA, United States of America"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Santa Clara, CA, United States of America","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101882004","display_name":"Xin Jin","orcid":"https://orcid.org/0000-0001-8741-5847"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Jin","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5013180923","display_name":"Mosharaf Chowdhury","orcid":"https://orcid.org/0000-0003-0884-6740"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mosharaf Chowdhury","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI, United States of America"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI, United States of America","institution_ids":["https://openalex.org/I27837315"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5064139539"],"corresponding_institution_ids":["https://openalex.org/I27837315"],"apc_list":null,"apc_paid":null,"fwci":8.1487,"has_fulltext":true,"cited_by_count":27,"citation_normalized_percentile":{"value":0.98552271,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"382","last_page":"395"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9894999861717224,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.8655155897140503},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8191916942596436},{"id":"https://openalex.org/keywords/template","display_name":"Template","score":0.7904355525970459},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6653789281845093},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.6633689403533936},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5407149791717529},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.5307830572128296},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.47367364168167114},{"id":"https://openalex.org/keywords/cover","display_name":"Cover (algebra)","score":0.4255576729774475},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.341646671295166},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.18723177909851074},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.12035730481147766},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09452521800994873},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07889944314956665}],"concepts":[{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.8655155897140503},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8191916942596436},{"id":"https://openalex.org/C82714645","wikidata":"https://www.wikidata.org/wiki/Q438331","display_name":"Template","level":2,"score":0.7904355525970459},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6653789281845093},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6633689403533936},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5407149791717529},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.5307830572128296},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.47367364168167114},{"id":"https://openalex.org/C2780428219","wikidata":"https://www.wikidata.org/wiki/Q16952335","display_name":"Cover (algebra)","level":2,"score":0.4255576729774475},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.341646671295166},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.18723177909851074},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.12035730481147766},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09452521800994873},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07889944314956665},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3600006.3613152","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3600006.3613152","pdf_url":"https://dl.acm.org/action/downloadSupplement?doi=10.1145%2F3600006.3613152&file=p382-jang-supp.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2309.08125","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.08125","pdf_url":"https://arxiv.org/pdf/2309.08125","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3600006.3613152","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3600006.3613152","pdf_url":"https://dl.acm.org/action/downloadSupplement?doi=10.1145%2F3600006.3613152&file=p382-jang-supp.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 29th Symposium on Operating Systems Principles","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3434055629","display_name":null,"funder_award_id":"CNS-2104243","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3700094867","display_name":null,"funder_award_id":"2106184","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4702057519","display_name":null,"funder_award_id":"2104243","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5390123671","display_name":null,"funder_award_id":"10424","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G618240637","display_name":null,"funder_award_id":"1909067","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8061712885","display_name":null,"funder_award_id":"CNS-2106184","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320316785","display_name":"VMware","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4386840193.pdf","grobid_xml":"https://content.openalex.org/works/W4386840193.grobid-xml"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W1515641485","https://openalex.org/W2007930753","https://openalex.org/W2056653303","https://openalex.org/W2142812297","https://openalex.org/W2332901121","https://openalex.org/W2767346922","https://openalex.org/W2963341956","https://openalex.org/W2969388332","https://openalex.org/W2979826702","https://openalex.org/W3081168214","https://openalex.org/W3086105743","https://openalex.org/W3129831491","https://openalex.org/W3130934537","https://openalex.org/W3132107458","https://openalex.org/W3176009043","https://openalex.org/W3204998121","https://openalex.org/W3205803342","https://openalex.org/W3206832494","https://openalex.org/W4220741164","https://openalex.org/W4226012237","https://openalex.org/W4226199141","https://openalex.org/W4245656449","https://openalex.org/W4311327447","https://openalex.org/W4327694855","https://openalex.org/W4386117806","https://openalex.org/W4386768656"],"related_works":["https://openalex.org/W2121300814","https://openalex.org/W4231091074","https://openalex.org/W1886613375","https://openalex.org/W4236081792","https://openalex.org/W4250583430","https://openalex.org/W4234406076","https://openalex.org/W2010731026","https://openalex.org/W2375218795","https://openalex.org/W2393010557","https://openalex.org/W1918900381"],"abstract_inverted_index":{"Oobleck":[0,61,105],"enables":[1],"resilient":[2],"distributed":[3],"training":[4],"of":[5,25,67,101],"large":[6,96],"DNN":[7,97],"models":[8,98],"with":[9,99],"guaranteed":[10],"fault":[11,114],"tolerance.":[12],"It":[13],"takes":[14],"a":[15,23],"planning-execution":[16],"co-design":[17],"approach,":[18],"where":[19],"it":[20,48,111],"first":[21],"generates":[22],"set":[24],"heterogeneous":[26],"pipeline":[27,38,71],"templates":[28,72],"and":[29,110,119],"instantiates":[30],"at":[31,91],"least":[32],"f":[33,43,82],"+":[34],"1":[35],"logically":[36],"equivalent":[37],"replicas":[39,56],"to":[40,57,76,123],"tolerate":[41],"any":[42],"simultaneous":[44,85],"failures.":[45],"During":[46],"execution,":[47],"relies":[49],"on":[50,95],"already-replicated":[51],"model":[52],"states":[53],"across":[54],"the":[55,68],"provide":[58],"fast":[59],"recovery.":[60],"provably":[62],"guarantees":[63],"that":[64,104],"some":[65],"combination":[66],"initially":[69],"created":[70],"can":[73],"be":[74],"used":[75],"cover":[77],"all":[78,92],"available":[79],"resources":[80],"after":[81],"or":[83],"fewer":[84],"failures,":[86],"thereby":[87],"avoiding":[88],"resource":[89],"idling":[90],"times.":[93],"Evaluation":[94],"billions":[100],"parameters":[102],"shows":[103],"provides":[106],"consistently":[107],"high":[108],"throughput,":[109],"outperforms":[112],"state-of-the-art":[113],"tolerance":[115],"solutions":[116],"like":[117],"Bamboo":[118],"Varuna":[120],"by":[121],"up":[122],"13.9\u00d7.":[124]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":15},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":2}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
