{"id":"https://openalex.org/W4414170563","doi":"https://doi.org/10.1109/iwqos65803.2025.11143379","title":"CROP: Efficient and Robust Multi-Job Placement in Deep Learning Clusters","display_name":"CROP: Efficient and Robust Multi-Job Placement in Deep Learning Clusters","publication_year":2025,"publication_date":"2025-07-02","ids":{"openalex":"https://openalex.org/W4414170563","doi":"https://doi.org/10.1109/iwqos65803.2025.11143379"},"language":"en","primary_location":{"id":"doi:10.1109/iwqos65803.2025.11143379","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iwqos65803.2025.11143379","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM 33rd International Symposium on Quality of Service (IWQoS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067923548","display_name":"Peng Yang","orcid":"https://orcid.org/0000-0002-1505-7857"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Peng Yang","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074187380","display_name":"Gongming Zhao","orcid":"https://orcid.org/0000-0003-1311-8908"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gongming Zhao","raw_affiliation_strings":["School of Computer Science and Technology, University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100715035","display_name":"Jing Wen","orcid":"https://orcid.org/0000-0003-3800-0105"},"institutions":[{"id":"https://openalex.org/I4210086088","display_name":"Guangxi Academy of Special Crops","ror":"https://ror.org/00gygke76","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210086088"]},{"id":"https://openalex.org/I4210112430","display_name":"Guangxi Zhuang Autonomous Region Health and Family Planning","ror":"https://ror.org/022z3g211","country_code":"CN","type":"government","lineage":["https://openalex.org/I4210112430"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Wen","raw_affiliation_strings":["Guangxi Zhuang Autonomous Region Information Center,Guangxi Key Laboratory of Digital Infrastructure"],"affiliations":[{"raw_affiliation_string":"Guangxi Zhuang Autonomous Region Information Center,Guangxi Key Laboratory of Digital Infrastructure","institution_ids":["https://openalex.org/I4210112430","https://openalex.org/I4210086088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063184427","display_name":"Hongli Xu","orcid":"https://orcid.org/0000-0003-3831-4577"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongli Xu","raw_affiliation_strings":["Suzhou Institute for Advanced Research, University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"Suzhou Institute for Advanced Research, University of Science and Technology of China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102773315","display_name":"Haibo Wang","orcid":"https://orcid.org/0000-0001-7866-4171"},"institutions":[{"id":"https://openalex.org/I143302722","display_name":"University of Kentucky","ror":"https://ror.org/02k3smh20","country_code":"US","type":"education","lineage":["https://openalex.org/I143302722"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haibo Wang","raw_affiliation_strings":["University of Kentucky"],"affiliations":[{"raw_affiliation_string":"University of Kentucky","institution_ids":["https://openalex.org/I143302722"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026477293","display_name":"Wentao Fan","orcid":"https://orcid.org/0000-0001-7671-7831"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wentao Fan","raw_affiliation_strings":["China Mobile (Suzhou) Software Technology Co. Ltd"],"affiliations":[{"raw_affiliation_string":"China Mobile (Suzhou) Software Technology Co. Ltd","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108403100","display_name":"Xiaohu Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaohu Xu","raw_affiliation_strings":["China Mobile (Suzhou) Software Technology Co. Ltd"],"affiliations":[{"raw_affiliation_string":"China Mobile (Suzhou) Software Technology Co. Ltd","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086781975","display_name":"Jun Yao","orcid":"https://orcid.org/0000-0002-9133-7017"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jun Yao","raw_affiliation_strings":["China Mobile (Suzhou) Software Technology Co. Ltd"],"affiliations":[{"raw_affiliation_string":"China Mobile (Suzhou) Software Technology Co. Ltd","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5067923548"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2546085,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.6887999773025513},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6265000104904175},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.6074000000953674},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5667999982833862},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4758000075817108},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4410000145435333},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.4120999872684479}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7857000231742859},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.6887999773025513},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6265000104904175},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.6074000000953674},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5667999982833862},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5573999881744385},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5440000295639038},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4758000075817108},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4410000145435333},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.4120999872684479},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3984000086784363},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3912000060081482},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.3596999943256378},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C2779582901","wikidata":"https://www.wikidata.org/wiki/Q21013010","display_name":"Distributed learning","level":2,"score":0.32109999656677246},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.30799999833106995},{"id":"https://openalex.org/C51823790","wikidata":"https://www.wikidata.org/wiki/Q504353","display_name":"Greedy algorithm","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2597000002861023}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iwqos65803.2025.11143379","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iwqos65803.2025.11143379","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM 33rd International Symposium on Quality of Service (IWQoS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1686810756","https://openalex.org/W1903029394","https://openalex.org/W1997783781","https://openalex.org/W2000444555","https://openalex.org/W2034673442","https://openalex.org/W2057332538","https://openalex.org/W2111281854","https://openalex.org/W2118382442","https://openalex.org/W2131613942","https://openalex.org/W2194775991","https://openalex.org/W2500139799","https://openalex.org/W2896457183","https://openalex.org/W3037377931","https://openalex.org/W3091097978","https://openalex.org/W3129927603","https://openalex.org/W3160031425","https://openalex.org/W3190774216","https://openalex.org/W4245040028","https://openalex.org/W4283211930","https://openalex.org/W4283219828","https://openalex.org/W4288089799","https://openalex.org/W4293117014","https://openalex.org/W4394923413"],"related_works":[],"abstract_inverted_index":{"Deep":[0],"learning":[1],"(DL)":[2],"has":[3],"seen":[4],"a":[5,20,50,118,136,148,157],"growing":[6],"dataset,":[7],"an":[8,96,131],"expanding":[9],"model":[10],"scale,":[11],"and":[12,71,101,112,121,156],"increasing":[13],"applications":[14],"in":[15,66,75,86,105],"recent":[16],"years.":[17],"There":[18],"is":[19],"notable":[21],"trend":[22],"of":[23,140,152,178],"shifting":[24],"DL":[25,34,47,69,106,180],"training":[26,44,176],"jobs":[27,48],"from":[28],"local":[29],"computing":[30],"units":[31],"to":[32,46,53,62,184,189],"powerful":[33],"clusters":[35,41],"built":[36],"by":[37,182],"cloud":[38],"providers.":[39],"These":[40],"allocate":[42],"physical":[43,154],"nodes":[45],"through":[49],"process":[51],"referred":[52],"as":[54,117],"multi-job":[55,58,103,114],"placement.":[56],"Existing":[57],"placement":[59,104,115],"strategies":[60],"fail":[61],"achieve":[63],"high":[64],"efficiency":[65],"resource":[67],"utilization,":[68],"training,":[70],"robustness":[72],"simultaneously,":[73],"resulting":[74],"poor":[76],"performance":[77],"when":[78,83],"resources":[79],"are":[80],"limited":[81],"or":[82],"abnormalities":[84],"occur":[85],"some":[87],"devices.":[88],"To":[89,125],"tackle":[90],"these":[91],"challenges,":[92],"we":[93,129],"present":[94,130],"CROP,":[95],"approach":[97],"that":[98,167],"performs":[99],"efficient":[100,111],"robust":[102,113],"clusters.":[107],"We":[108,144],"formulate":[109],"the":[110,175,179],"problem":[116],"non-linear":[119],"program":[120],"prove":[122],"its":[123],"NP-hardness.":[124],"solve":[126],"this":[127],"problem,":[128],"effective":[132],"submodular-based":[133],"algorithm":[134],"with":[135],"tight":[137],"approximation":[138],"factor":[139],"(<tex":[141],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[142,186],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$1-1/e$</tex>).":[143],"evaluate":[145],"CROP":[146,168],"on":[147],"small-scale":[149],"testbed":[150],"consisting":[151],"8":[153],"GPUs":[155],"large-scale":[158],"simulation":[159],"employing":[160],"real-world":[161],"job":[162],"traces.":[163],"Experimental":[164],"results":[165],"demonstrate":[166],"achieves":[169],"nearoptimal":[170],"communication":[171],"overhead":[172],"while":[173],"improving":[174],"throughput":[177],"cluster":[181],"up":[183],"<tex":[185],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$57.5\\%$</tex>":[187],"compared":[188],"state-of-the-art":[190],"solutions.":[191]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
