{"id":"https://openalex.org/W4413411893","doi":"https://doi.org/10.1145/3721145.3728488","title":"Fast and Fair Training for Deep Learning in Heterogeneous GPU Clusters","display_name":"Fast and Fair Training for Deep Learning in Heterogeneous GPU Clusters","publication_year":2025,"publication_date":"2025-06-08","ids":{"openalex":"https://openalex.org/W4413411893","doi":"https://doi.org/10.1145/3721145.3728488"},"language":"en","primary_location":{"id":"doi:10.1145/3721145.3728488","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3721145.3728488","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3721145.3728488","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3721145.3728488","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005355007","display_name":"Zizhao Mo","orcid":"https://orcid.org/0000-0002-3590-4400"},"institutions":[{"id":"https://openalex.org/I204512498","display_name":"University of Macau","ror":"https://ror.org/01r4q9n85","country_code":"MO","type":"education","lineage":["https://openalex.org/I204512498"]}],"countries":["MO"],"is_corresponding":true,"raw_author_name":"Zizhao Mo","raw_affiliation_strings":["University of Macau, Macau, China"],"affiliations":[{"raw_affiliation_string":"University of Macau, Macau, China","institution_ids":["https://openalex.org/I204512498"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017636962","display_name":"Huanle Xu","orcid":"https://orcid.org/0000-0001-6657-1154"},"institutions":[{"id":"https://openalex.org/I204512498","display_name":"University of Macau","ror":"https://ror.org/01r4q9n85","country_code":"MO","type":"education","lineage":["https://openalex.org/I204512498"]}],"countries":["MO"],"is_corresponding":false,"raw_author_name":"Huanle Xu","raw_affiliation_strings":["University of Macau, Macau, China"],"affiliations":[{"raw_affiliation_string":"University of Macau, Macau, China","institution_ids":["https://openalex.org/I204512498"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020413351","display_name":"Wing Cheong Lau","orcid":"https://orcid.org/0000-0003-1179-7855"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Wing Cheong Lau","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, China","institution_ids":["https://openalex.org/I177725633"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5005355007"],"corresponding_institution_ids":["https://openalex.org/I204512498"],"apc_list":null,"apc_paid":null,"fwci":2.4268,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.90366227,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"324","last_page":"338"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7690960168838501},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6829985976219177},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6076115369796753},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5641772747039795},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.5193344354629517},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.514248788356781},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.34448331594467163},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.33741480112075806},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.1382981538772583}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7690960168838501},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6829985976219177},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6076115369796753},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5641772747039795},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.5193344354629517},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.514248788356781},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.34448331594467163},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.33741480112075806},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.1382981538772583},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3721145.3728488","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3721145.3728488","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3721145.3728488","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th ACM International Conference on Supercomputing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3721145.3728488","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3721145.3728488","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3721145.3728488","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 39th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1797736156","display_name":null,"funder_award_id":"MYRG-","funder_id":"https://openalex.org/F4320322841","funder_display_name":"Universidade de Macau"},{"id":"https://openalex.org/G5237518980","display_name":null,"funder_award_id":"-FST-UMDF","funder_id":"https://openalex.org/F4320322841","funder_display_name":"Universidade de Macau"}],"funders":[{"id":"https://openalex.org/F4320311133","display_name":"United Mitochondrial Disease Foundation","ror":"https://ror.org/0528q0t18"},{"id":"https://openalex.org/F4320322841","display_name":"Universidade de Macau","ror":"https://ror.org/01r4q9n85"},{"id":"https://openalex.org/F4320322942","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4413411893.pdf","grobid_xml":"https://content.openalex.org/works/W4413411893.grobid-xml"},"referenced_works_count":28,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W1583497301","https://openalex.org/W1998047408","https://openalex.org/W2073440460","https://openalex.org/W2133569115","https://openalex.org/W2147740236","https://openalex.org/W2285796902","https://openalex.org/W2340234317","https://openalex.org/W2798515322","https://openalex.org/W2808764965","https://openalex.org/W2951825612","https://openalex.org/W2962752334","https://openalex.org/W2963291073","https://openalex.org/W2964057209","https://openalex.org/W3022298203","https://openalex.org/W3028727260","https://openalex.org/W3204998121","https://openalex.org/W4255654664","https://openalex.org/W4288079579","https://openalex.org/W4318541537","https://openalex.org/W4372262787","https://openalex.org/W4387321109","https://openalex.org/W4394923484","https://openalex.org/W4395020669","https://openalex.org/W4400410044","https://openalex.org/W4401753359","https://openalex.org/W4404787801","https://openalex.org/W4405756102"],"related_works":["https://openalex.org/W2505380084","https://openalex.org/W230091440","https://openalex.org/W4400333498","https://openalex.org/W2233261550","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2997094352","https://openalex.org/W2086739451","https://openalex.org/W3183233360","https://openalex.org/W1980160788"],"abstract_inverted_index":{"The":[0],"GPU":[1,64],"device":[2],"heterogeneity":[3],"in":[4,16,62,76,97,123],"accelerating":[5],"deep":[6,59],"learning":[7,60],"training":[8],"workloads":[9],"poses":[10],"significant":[11],"challenges":[12],"for":[13,55],"job":[14,19,27,146],"scheduling":[15,52],"datacenters.Existing":[17],"heterogeneity-aware":[18,155],"schedulers,":[20],"however,":[21],"cannot":[22],"effectively":[23],"reduce":[24],"the":[25,110,137],"overall":[26,138],"completion":[28],"time":[29],"(JCT)":[30],"or":[31],"provide":[32],"fairness":[33,90,96],"guarantees":[34],"due":[35],"to":[36,79,108,142,153],"their":[37],"coarse-grained":[38],"resource":[39,73,84],"allocation":[40,74],"and":[41,57,119],"poor":[42],"integration":[43],"of":[44],"conflicting":[45],"objectives.This":[46],"paper":[47],"presents":[48],"FFT,":[49],"a":[50,72,89,104,114],"novel":[51],"system":[53],"designed":[54],"Fast":[56],"Fair":[58],"Training":[61],"heterogeneous":[63],"clusters.FFT":[65],"incorporates":[66,71],"two":[67],"key":[68],"designs.First,":[69],"it":[70,86],"scheme":[75],"each":[77],"round":[78],"enable":[80],"fine-grained":[81],"control":[82],"over":[83],"utilization.Second,":[85],"seamlessly":[87],"integrates":[88],"compensation":[91],"mechanism":[92],"that":[93,132],"dynamically":[94],"evaluates":[95],"real-time.Building":[98],"upon":[99],"these":[100],"designs,":[101],"FFT":[102,133],"formulates":[103],"cost":[105],"minimization":[106],"problem":[107],"determine":[109],"optimal":[111],"schedule,":[112],"striking":[113],"delicate":[115],"balance":[116],"between":[117],"efficiency":[118],"fairness.Extensive":[120],"experiments":[121],"conducted":[122],"physical":[124],"clusters":[125],"as":[126,128],"well":[127],"large-scale":[129],"testbed":[130],"demonstrate":[131],"can":[134],"significantly":[135],"accelerate":[136],"JCT":[139],"by":[140,148],"up":[141],"5.2":[143],"while":[144],"improving":[145],"finishtime-fairness":[147],"more":[149],"than":[150],"2.2":[151],"compared":[152],"state-of-the-art":[154],"solutions.":[156]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
