{"id":"https://openalex.org/W4386709683","doi":"https://doi.org/10.1145/3605573.3605583","title":"Embracing Uncertainty for Equity in Resource Allocation in ML Training","display_name":"Embracing Uncertainty for Equity in Resource Allocation in ML Training","publication_year":2023,"publication_date":"2023-08-07","ids":{"openalex":"https://openalex.org/W4386709683","doi":"https://doi.org/10.1145/3605573.3605583"},"language":"en","primary_location":{"id":"doi:10.1145/3605573.3605583","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3605573.3605583","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3605573.3605583","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3605573.3605583","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5037178139","display_name":"Suraiya Tairin","orcid":"https://orcid.org/0009-0004-1946-5235"},"institutions":[{"id":"https://openalex.org/I51556381","display_name":"University of Virginia","ror":"https://ror.org/0153tk833","country_code":"US","type":"education","lineage":["https://openalex.org/I51556381"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Suraiya Tairin","raw_affiliation_strings":["University of Virginia, USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia, USA","institution_ids":["https://openalex.org/I51556381"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050569064","display_name":"Haiying Shen","orcid":"https://orcid.org/0000-0002-7548-6223"},"institutions":[{"id":"https://openalex.org/I51556381","display_name":"University of Virginia","ror":"https://ror.org/0153tk833","country_code":"US","type":"education","lineage":["https://openalex.org/I51556381"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haiying Shen","raw_affiliation_strings":["University of Virginia, United States of America"],"affiliations":[{"raw_affiliation_string":"University of Virginia, United States of America","institution_ids":["https://openalex.org/I51556381"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040160093","display_name":"Zeyu Zhang","orcid":"https://orcid.org/0009-0005-7853-6854"},"institutions":[{"id":"https://openalex.org/I51556381","display_name":"University of Virginia","ror":"https://ror.org/0153tk833","country_code":"US","type":"education","lineage":["https://openalex.org/I51556381"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zeyu Zhang","raw_affiliation_strings":["University of Virginia, USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia, USA","institution_ids":["https://openalex.org/I51556381"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5037178139"],"corresponding_institution_ids":["https://openalex.org/I51556381"],"apc_list":null,"apc_paid":null,"fwci":2.2464,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.9042356,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"423","last_page":"432"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9937999844551086,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7515443563461304},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6584513783454895},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.47263476252555847},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.46083053946495056},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.4477357268333435},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.44604766368865967},{"id":"https://openalex.org/keywords/job-scheduler","display_name":"Job scheduler","score":0.4129270613193512},{"id":"https://openalex.org/keywords/operations-research","display_name":"Operations research","score":0.36017119884490967},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.26084429025650024},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.2527904212474823},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.24370181560516357},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09725555777549744},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07527932524681091}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7515443563461304},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6584513783454895},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.47263476252555847},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.46083053946495056},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4477357268333435},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.44604766368865967},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.4129270613193512},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.36017119884490967},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.26084429025650024},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2527904212474823},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.24370181560516357},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09725555777549744},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07527932524681091},{"id":"https://openalex.org/C160403385","wikidata":"https://www.wikidata.org/wiki/Q220543","display_name":"Queue","level":2,"score":0.0},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3605573.3605583","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3605573.3605583","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3605573.3605583","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3605573.3605583","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3605573.3605583","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3605573.3605583","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","score":0.6100000143051147,"id":"https://metadata.un.org/sdg/8"}],"awards":[{"id":"https://openalex.org/G164031158","display_name":null,"funder_award_id":"1822965","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2663109616","display_name":null,"funder_award_id":"2206522, 1827674","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4140588867","display_name":null,"funder_award_id":"2206522","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7415302333","display_name":null,"funder_award_id":"NSF-1822965","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4386709683.pdf","grobid_xml":"https://content.openalex.org/works/W4386709683.grobid-xml"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W950821216","https://openalex.org/W1575332982","https://openalex.org/W2031407623","https://openalex.org/W2523435939","https://openalex.org/W2751802138","https://openalex.org/W2780958346","https://openalex.org/W2794670651","https://openalex.org/W2798515322","https://openalex.org/W2894576937","https://openalex.org/W2919897868","https://openalex.org/W2920397365","https://openalex.org/W2962684017","https://openalex.org/W2962725887","https://openalex.org/W2964923275","https://openalex.org/W2968986602","https://openalex.org/W3011678990","https://openalex.org/W3097875570","https://openalex.org/W3108033633","https://openalex.org/W3124352525","https://openalex.org/W3155611867","https://openalex.org/W3162118826","https://openalex.org/W3202665973","https://openalex.org/W4285212262","https://openalex.org/W4288079579","https://openalex.org/W4290991121","https://openalex.org/W4361807675"],"related_works":["https://openalex.org/W4306904969","https://openalex.org/W2138720691","https://openalex.org/W4362501864","https://openalex.org/W4380318855","https://openalex.org/W3084456289","https://openalex.org/W2024136090","https://openalex.org/W4391331176","https://openalex.org/W2031695474","https://openalex.org/W44553394","https://openalex.org/W3047653192"],"abstract_inverted_index":{"To":[0,35,117],"reduce":[1,118,165],"the":[2,20,51,65,71,109,119,190],"Deep":[3],"Learning":[4],"(DL)":[5],"model":[6],"training":[7],"time":[8,115],"and":[9,22,61,97,99,170,178],"hence":[10],"resource":[11,26,83,95],"consumption,":[12],"it":[13],"is":[14],"critical":[15],"to":[16,31,80,104,138,154,167,185],"avoid":[17],"stragglers.":[18,157],"However,":[19],"dynamics":[21,60],"uncertainty":[23,62],"features":[24],"of":[25,53,111],"availability":[27],"pose":[28],"a":[29,41,54,126,139,146],"challenge":[30],"avoiding":[32],"stragglers":[33,172],"caused.":[34],"handle":[36,155],"this":[37],"challenge,":[38],"we":[39,123,144],"propose":[40,125,145],"Straggler-Avoiding":[42],"job":[43,55,103,113,137,176],"Scheduling":[44],"approach":[45,132],"(SAS),":[46],"which":[47],"smartly":[48],"ensures":[49],"that":[50,64,134,163],"tasks":[52,66],"receive":[56],"resources":[57],"with":[58,85,92,108,174,189],"similar":[59,93],"so":[63],"can":[67],"complete":[68],"at":[69],"approximately":[70],"same":[72],"time.":[73],"Specifically,":[74],"SAS":[75,164],"uses":[76],"an":[77],"ML":[78],"method":[79,153],"predict":[81],"available":[82,94],"amounts":[84,96],"probability":[86],"in":[87],"future":[88],"times,":[89],"groups":[90],"nodes":[91],"probabilities,":[98],"then":[100],"assigns":[101,135],"each":[102,136],"one":[105],"node":[106,140],"group":[107],"objective":[110],"minimizing":[112],"completion":[114],"(JCT).":[116],"decision":[120],"making":[121],"time,":[122],"also":[124],"reinforcement":[127],"learning":[128],"(RL)":[129],"based":[130],"scheduling":[131],"(SAS-RL)":[133],"group.":[141],"In":[142],"addition,":[143],"distributed":[147],"parameter":[148],"server":[149],"(PS)":[150],"load":[151,181,193],"reassignment":[152,182],"PS":[156,180,192],"Our":[158],"trace-driven":[159],"real":[160],"experiments":[161],"show":[162],"up":[166,184],"45%":[168],"JCT":[169,187],"63%":[171],"compared":[173,188],"existing":[175],"schedulers,":[177],"our":[179],"reduces":[183],"48%":[186],"previous":[191],"distribution":[194],"scheme.":[195]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
