{"id":"https://openalex.org/W4407130170","doi":"https://doi.org/10.1109/icnp61940.2024.10858582","title":"Non-Idle Machine-Aware Worker Placement for Efficient Distributed Training in GPU Clusters","display_name":"Non-Idle Machine-Aware Worker Placement for Efficient Distributed Training in GPU Clusters","publication_year":2024,"publication_date":"2024-10-28","ids":{"openalex":"https://openalex.org/W4407130170","doi":"https://doi.org/10.1109/icnp61940.2024.10858582"},"language":"en","primary_location":{"id":"doi:10.1109/icnp61940.2024.10858582","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp61940.2024.10858582","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 32nd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101792015","display_name":"Jin Fang","orcid":"https://orcid.org/0000-0002-8268-1879"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jin Fang","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074187380","display_name":"Gongming Zhao","orcid":"https://orcid.org/0000-0003-1311-8908"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gongming Zhao","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063184427","display_name":"Hongli Xu","orcid":"https://orcid.org/0000-0003-3831-4577"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongli Xu","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087112835","display_name":"Luyao Luo","orcid":"https://orcid.org/0000-0002-6255-4370"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Luyao Luo","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069116683","display_name":"Zhen Yao","orcid":"https://orcid.org/0000-0001-8369-843X"},"institutions":[{"id":"https://openalex.org/I4210160618","display_name":"Huawei Technologies (United Kingdom)","ror":"https://ror.org/056gzgs71","country_code":"GB","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210160618"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Zhen Yao","raw_affiliation_strings":["Huawei Technologies Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd","institution_ids":["https://openalex.org/I4210160618"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009713572","display_name":"An Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I4210160618","display_name":"Huawei Technologies (United Kingdom)","ror":"https://ror.org/056gzgs71","country_code":"GB","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210160618"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"An Xie","raw_affiliation_strings":["Huawei Technologies Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd","institution_ids":["https://openalex.org/I4210160618"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28959276,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9760000109672546,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9760000109672546,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9710999727249146,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9495000243186951,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/idle","display_name":"Idle","score":0.8699731826782227},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7905615568161011},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5730580687522888},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5669865012168884},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.34263768792152405},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2885943651199341}],"concepts":[{"id":"https://openalex.org/C16320812","wikidata":"https://www.wikidata.org/wiki/Q1812200","display_name":"Idle","level":2,"score":0.8699731826782227},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7905615568161011},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5730580687522888},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5669865012168884},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.34263768792152405},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2885943651199341},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icnp61940.2024.10858582","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp61940.2024.10858582","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 32nd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Decent work and economic growth","score":0.6299999952316284,"id":"https://metadata.un.org/sdg/8"}],"awards":[{"id":"https://openalex.org/G2895427404","display_name":null,"funder_award_id":"62372426,62102392","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3961584101","display_name":null,"funder_award_id":"2023481","funder_id":"https://openalex.org/F4320322847","funder_display_name":"Youth Innovation Promotion Association of the Chinese Academy of Sciences"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322847","display_name":"Youth Innovation Promotion Association of the Chinese Academy of Sciences","ror":"https://ror.org/031141b54"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1997783781","https://openalex.org/W2000444555","https://openalex.org/W2118382442","https://openalex.org/W2130531694","https://openalex.org/W2194775991","https://openalex.org/W2500139799","https://openalex.org/W3016395792","https://openalex.org/W3036703963","https://openalex.org/W3042185737","https://openalex.org/W3154431492","https://openalex.org/W3175449831","https://openalex.org/W3197816522","https://openalex.org/W4221008227","https://openalex.org/W4224926627","https://openalex.org/W4283211930","https://openalex.org/W4318541537","https://openalex.org/W4321484191","https://openalex.org/W4372262787","https://openalex.org/W4386230822","https://openalex.org/W6727690538","https://openalex.org/W6739622702","https://openalex.org/W6781728138","https://openalex.org/W6785115463","https://openalex.org/W6793903029","https://openalex.org/W6811129797","https://openalex.org/W6862033441"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2974485871","https://openalex.org/W1577119738","https://openalex.org/W2908872315","https://openalex.org/W1600399803","https://openalex.org/W4235210722","https://openalex.org/W4388633481","https://openalex.org/W2994960476"],"abstract_inverted_index":{"Distributed":[0],"training":[1,16,91,188],"(DT)":[2],"has":[3],"emerged":[4],"as":[5,62,66],"a":[6,102,123,150,161,169],"solution":[7],"to":[8,32,46,68,77,200],"address":[9],"the":[10,83,86,109,116,130,140,192,195,201],"growing":[11],"computational":[12],"resource":[13,78,112],"demands":[14],"of":[15,85,111,118,173,194],"large-scale":[17,162],"machine":[18],"learning":[19],"models.":[20],"To":[21,138],"meet":[22],"this":[23,73,97],"need,":[24],"major":[25],"cloud":[26,43,94],"providers":[27,44],"typically":[28],"build":[29],"GPU":[30,87],"clusters":[31],"accommodate":[33],"DT":[34,40],"jobs.":[35],"Specifically,":[36],"for":[37,93],"an":[38,144],"incoming":[39],"job":[41,166],"request,":[42],"need":[45],"determine":[47],"in":[48],"which":[49,81],"GPUs":[50],"place":[51,59],"workers":[52,60],"(i.e.,":[53],"worker":[54,104],"placement).":[55],"Existing":[56],"approaches":[57],"usually":[58],"on":[61],"few":[63],"idle":[64],"machines":[65],"possible":[67],"minimize":[69],"communication":[70,132],"time.":[71],"However,":[72],"scheme":[74,106],"will":[75],"lead":[76],"fragmentation":[79,113],"problem,":[80,141],"degrades":[82],"efficiency":[84,193],"cluster":[88,196],"and":[89,134,168],"increases":[90],"costs":[92],"providers.":[95],"In":[96],"paper,":[98],"we":[99],"propose":[100],"Titan,":[101],"novel":[103],"placement":[105],"that":[107,128,183],"mitigates":[108],"influence":[110],"by":[114,197],"enhancing":[115],"utilization":[117],"non-idle":[119],"machines.":[120],"Titan":[121,142,159,184],"formulates":[122],"multi-objectives":[124],"non-linear":[125],"optimization":[126],"problem":[127],"incorporates":[129],"collective":[131],"constraint":[133],"proves":[135],"its":[136],"NP-hardness.":[137],"solve":[139],"presents":[143],"effective":[145],"submodular-based":[146],"greedy":[147],"algorithm":[148],"with":[149,160,176],"tight":[151],"approximation":[152],"ratio":[153],"(<tex":[154],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[155],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$1-\\frac{1}{e}$</tex>).":[156],"We":[157],"evaluate":[158],"simulation":[163],"employing":[164],"real-world":[165],"traces":[167],"small-scale":[170],"testbed":[171],"consisting":[172],"8":[174],"servers":[175],"32":[177],"logical":[178],"GPUs.":[179],"Experimental":[180],"results":[181],"show":[182],"can":[185],"achieve":[186],"near-optimal":[187],"throughput":[189],"while":[190],"improving":[191],"74.9%":[198],"compared":[199],"state-of-the-art":[202],"solutions.":[203]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
