{"id":"https://openalex.org/W7124142110","doi":"https://doi.org/10.1109/icpads67057.2025.11323035","title":"FEDeS: Fair, Efficient, and Reliable Multi-Tenant Deep Learning Training with Serverless Computing","display_name":"FEDeS: Fair, Efficient, and Reliable Multi-Tenant Deep Learning Training with Serverless Computing","publication_year":2025,"publication_date":"2025-12-14","ids":{"openalex":"https://openalex.org/W7124142110","doi":"https://doi.org/10.1109/icpads67057.2025.11323035"},"language":null,"primary_location":{"id":"doi:10.1109/icpads67057.2025.11323035","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpads67057.2025.11323035","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123016329","display_name":"Yeonhyeok Jeong","orcid":null},"institutions":[{"id":"https://openalex.org/I48566637","display_name":"Ulsan National Institute of Science and Technology","ror":"https://ror.org/017cjz748","country_code":"KR","type":"education","lineage":["https://openalex.org/I48566637"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Yeonhyeok Jeong","raw_affiliation_strings":["UNIST"],"affiliations":[{"raw_affiliation_string":"UNIST","institution_ids":["https://openalex.org/I48566637"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Seungmin Lee","orcid":null},"institutions":[{"id":"https://openalex.org/I48566637","display_name":"Ulsan National Institute of Science and Technology","ror":"https://ror.org/017cjz748","country_code":"KR","type":"education","lineage":["https://openalex.org/I48566637"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Seungmin Lee","raw_affiliation_strings":["UNIST"],"affiliations":[{"raw_affiliation_string":"UNIST","institution_ids":["https://openalex.org/I48566637"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123010722","display_name":"Seonghyeon Jue","orcid":null},"institutions":[{"id":"https://openalex.org/I48566637","display_name":"Ulsan National Institute of Science and Technology","ror":"https://ror.org/017cjz748","country_code":"KR","type":"education","lineage":["https://openalex.org/I48566637"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Seonghyeon Jue","raw_affiliation_strings":["UNIST"],"affiliations":[{"raw_affiliation_string":"UNIST","institution_ids":["https://openalex.org/I48566637"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070346739","display_name":"Sam H. Noh","orcid":"https://orcid.org/0000-0002-9152-0321"},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sam H. Noh","raw_affiliation_strings":["Virginia Tech"],"affiliations":[{"raw_affiliation_string":"Virginia Tech","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017226492","display_name":"Young-ri Choi","orcid":"https://orcid.org/0009-0008-7475-109X"},"institutions":[{"id":"https://openalex.org/I48566637","display_name":"Ulsan National Institute of Science and Technology","ror":"https://ror.org/017cjz748","country_code":"KR","type":"education","lineage":["https://openalex.org/I48566637"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Young-ri Choi","raw_affiliation_strings":["UNIST"],"affiliations":[{"raw_affiliation_string":"UNIST","institution_ids":["https://openalex.org/I48566637"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5123016329"],"corresponding_institution_ids":["https://openalex.org/I48566637"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.81406459,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.6365000009536743,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.6365000009536743,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.1664000004529953,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.060100000351667404,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.7828999757766724},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6656000018119812},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5737000107765198},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.5178999900817871},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.4505999982357025},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4032000005245209},{"id":"https://openalex.org/keywords/elasticity","display_name":"Elasticity (physics)","score":0.3903999924659729}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8353999853134155},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.7828999757766724},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6656000018119812},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5737000107765198},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5461000204086304},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5343999862670898},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.5178999900817871},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.4505999982357025},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4032000005245209},{"id":"https://openalex.org/C121854251","wikidata":"https://www.wikidata.org/wiki/Q62932","display_name":"Elasticity (physics)","level":2,"score":0.3903999924659729},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3458999991416931},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.29339998960494995},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28839999437332153},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.2874999940395355},{"id":"https://openalex.org/C513985346","wikidata":"https://www.wikidata.org/wiki/Q270471","display_name":"Virtualization","level":3,"score":0.2800000011920929},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2612999975681305},{"id":"https://openalex.org/C119898033","wikidata":"https://www.wikidata.org/wiki/Q3433888","display_name":"Ensemble forecasting","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.2531000077724457},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icpads67057.2025.11323035","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpads67057.2025.11323035","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth","score":0.4191230237483978}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W162895179","https://openalex.org/W2045271686","https://openalex.org/W2060393849","https://openalex.org/W2107098536","https://openalex.org/W2108598243","https://openalex.org/W2112651106","https://openalex.org/W2147524058","https://openalex.org/W2194775991","https://openalex.org/W2402144811","https://openalex.org/W2482213519","https://openalex.org/W2549139847","https://openalex.org/W2813613744","https://openalex.org/W2918828872","https://openalex.org/W2963748441","https://openalex.org/W2987607480","https://openalex.org/W2988070836","https://openalex.org/W3086105743","https://openalex.org/W3159219445","https://openalex.org/W3205260334","https://openalex.org/W4318541537","https://openalex.org/W4386840193","https://openalex.org/W4387302750","https://openalex.org/W4388662057","https://openalex.org/W4398796293"],"related_works":[],"abstract_inverted_index":{"Serverless":[0,64],"computing":[1,8,47],"platforms":[2],"have":[3],"become":[4],"popular":[5],"in":[6],"cloud":[7,21],"environments":[9],"due":[10],"to":[11,76,98],"several":[12],"advantages":[13],"such":[14,145],"as":[15,146],"elasticity":[16],"and":[17,23,57,94,119,128,148,165],"flexibility":[18],"of":[19,25,70,84,157],"using":[20,141,158],"resources":[22],"statelessness":[24],"serverless":[26,46,78,85,160],"functions.":[27,79],"In":[28],"this":[29],"paper,":[30],"we":[31,87],"explore":[32],"integrating":[33],"a":[34,51,71,130,159],"GPU":[35,96],"cluster":[36],"management":[37],"framework":[38],"for":[39,135],"multitenant":[40],"Deep":[41,60],"Learning":[42,61],"(DL)":[43],"training":[44,68,137],"with":[45,63,108],"technologies.":[48],"We":[49,102],"propose":[50,88],"novel":[52],"framework,":[53],"FEDeS":[54],"(Fair,":[55],"Efficient,":[56],"Reliable":[58],"Multi-tenant":[59],"Training":[62],"Computing),":[65],"where":[66],"the":[67,81,115,123,155],"process":[69],"DL":[72,100,136,143],"model":[73],"is":[74],"converted":[75],"chained":[77],"Leveraging":[80],"independent":[82],"execution":[83],"functions,":[86],"gang-relaxed":[89],"scheduling,":[90],"which":[91,113],"enables":[92],"elastic":[93],"flexible":[95],"allocation":[97,106],"each":[99],"job.":[101],"also":[103],"present":[104],"resource":[105],"policies":[107],"generalized":[109],"<tex":[110,124],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[111,125],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\alpha$</tex>-fairness,":[112],"controls":[114],"tradeoff":[116],"between":[117],"fairness":[118],"efficiency":[120],"by":[121],"adjusting":[122],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\alpha$</tex>":[126],"value,":[127],"devise":[129],"fine-grained":[131],"fault":[132],"tolerance":[133],"method":[134],"jobs.":[138],"Experimental":[139],"results":[140],"various":[142],"models":[144],"BERT":[147],"GPT":[149],"on":[150],"real":[151],"workload":[152],"traces":[153],"show":[154],"benefits":[156],"platform,":[161],"including":[162],"efficiency,":[163],"fairness,":[164],"reliability.":[166]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2026-01-15T00:00:00"}
