{"id":"https://openalex.org/W4206647593","doi":"https://doi.org/10.1109/tnsm.2021.3132361","title":"Beamer: Stage-Aware Coflow Scheduling to Accelerate Hyper-Parameter Tuning in Deep Learning Clusters","display_name":"Beamer: Stage-Aware Coflow Scheduling to Accelerate Hyper-Parameter Tuning in Deep Learning Clusters","publication_year":2021,"publication_date":"2021-12-03","ids":{"openalex":"https://openalex.org/W4206647593","doi":"https://doi.org/10.1109/tnsm.2021.3132361"},"language":"en","primary_location":{"id":"doi:10.1109/tnsm.2021.3132361","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnsm.2021.3132361","pdf_url":null,"source":{"id":"https://openalex.org/S173527311","display_name":"IEEE Transactions on Network and Service Management","issn_l":"1932-4537","issn":["1932-4537","2373-7379"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Network and Service Management","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073218808","display_name":"Yihong He","orcid":"https://orcid.org/0000-0001-5253-1995"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yihong He","raw_affiliation_strings":["Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085412582","display_name":"Weibo Cai","orcid":"https://orcid.org/0000-0003-4641-0833"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weibo Cai","raw_affiliation_strings":["Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020559505","display_name":"Pan Zhou","orcid":"https://orcid.org/0000-0003-3373-728X"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pan Zhou","raw_affiliation_strings":["Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053945147","display_name":"Gang Sun","orcid":"https://orcid.org/0000-0002-2448-8915"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gang Sun","raw_affiliation_strings":["Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","Agile and Intelligent Computing Key Laboratory of Sichuan Province, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"Agile and Intelligent Computing Key Laboratory of Sichuan Province, Chengdu, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075454046","display_name":"Shouxi Luo","orcid":"https://orcid.org/0000-0002-4041-3681"},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shouxi Luo","raw_affiliation_strings":["School of Computing and Artificial Intelligence, Southwest Jiaotong University, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"School of Computing and Artificial Intelligence, Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101507232","display_name":"Hongfang Yu","orcid":"https://orcid.org/0000-0002-5219-1780"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]},{"id":"https://openalex.org/I4210136793","display_name":"Peng Cheng Laboratory","ror":"https://ror.org/03qdqbt06","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210136793"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongfang Yu","raw_affiliation_strings":["Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","Peng Cheng Laboratory, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Optical Fiber Sensing and Communications (Ministry of Education), University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"Peng Cheng Laboratory, Shenzhen, China","institution_ids":["https://openalex.org/I4210136793"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057916222","display_name":"Mohsen Guizani","orcid":"https://orcid.org/0000-0002-8972-8094"},"institutions":[{"id":"https://openalex.org/I4210113480","display_name":"Mohamed bin Zayed University of Artificial Intelligence","ror":"https://ror.org/0258gkt32","country_code":"AE","type":"education","lineage":["https://openalex.org/I4210113480"]}],"countries":["AE"],"is_corresponding":false,"raw_author_name":"Mohsen Guizani","raw_affiliation_strings":["Machine Learning Department, MBZUAI, Abu Dhabi, UAE"],"affiliations":[{"raw_affiliation_string":"Machine Learning Department, MBZUAI, Abu Dhabi, UAE","institution_ids":["https://openalex.org/I4210113480"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5073218808"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":0.6725,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.72168301,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":"19","issue":"2","first_page":"1083","last_page":"1097"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.808535635471344},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.7301393747329712},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.671345591545105},{"id":"https://openalex.org/keywords/retraining","display_name":"Retraining","score":0.5695875287055969},{"id":"https://openalex.org/keywords/fifo","display_name":"FIFO (computing and electronics)","score":0.5555576682090759},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4531155824661255},{"id":"https://openalex.org/keywords/traverse","display_name":"Traverse","score":0.43463775515556335},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4338001608848572},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.39505326747894287},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3851628303527832},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.21898645162582397},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.12785878777503967},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.12784430384635925}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.808535635471344},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.7301393747329712},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.671345591545105},{"id":"https://openalex.org/C2778712577","wikidata":"https://www.wikidata.org/wiki/Q3505966","display_name":"Retraining","level":2,"score":0.5695875287055969},{"id":"https://openalex.org/C2777145635","wikidata":"https://www.wikidata.org/wiki/Q515636","display_name":"FIFO (computing and electronics)","level":2,"score":0.5555576682090759},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4531155824661255},{"id":"https://openalex.org/C176809094","wikidata":"https://www.wikidata.org/wiki/Q15401496","display_name":"Traverse","level":2,"score":0.43463775515556335},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4338001608848572},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.39505326747894287},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3851628303527832},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.21898645162582397},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.12785878777503967},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.12784430384635925},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C155202549","wikidata":"https://www.wikidata.org/wiki/Q178803","display_name":"International trade","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tnsm.2021.3132361","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnsm.2021.3132361","pdf_url":null,"source":{"id":"https://openalex.org/S173527311","display_name":"IEEE Transactions on Network and Service Management","issn_l":"1932-4537","issn":["1932-4537","2373-7379"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Network and Service Management","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.47999998927116394,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[{"id":"https://openalex.org/G6636833293","display_name":null,"funder_award_id":"(62102066)","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7300317393","display_name":null,"funder_award_id":"(2019YFB1802800)","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W169464267","https://openalex.org/W1960330418","https://openalex.org/W1982055457","https://openalex.org/W2075265959","https://openalex.org/W2097998348","https://openalex.org/W2169699126","https://openalex.org/W2556522401","https://openalex.org/W2594041104","https://openalex.org/W2608595939","https://openalex.org/W2732547613","https://openalex.org/W2798515322","https://openalex.org/W2809420642","https://openalex.org/W2824896242","https://openalex.org/W2883265831","https://openalex.org/W2899071864","https://openalex.org/W2906978492","https://openalex.org/W2919594608","https://openalex.org/W2920397365","https://openalex.org/W2952435244","https://openalex.org/W2962725887","https://openalex.org/W2962758826","https://openalex.org/W2963403751","https://openalex.org/W2963968531","https://openalex.org/W2965898445","https://openalex.org/W2987954639","https://openalex.org/W3004561114","https://openalex.org/W3015537910","https://openalex.org/W3023359584","https://openalex.org/W3138800689","https://openalex.org/W3160520312","https://openalex.org/W4213308398","https://openalex.org/W4232284301","https://openalex.org/W4255119735","https://openalex.org/W4289422886","https://openalex.org/W6674385629","https://openalex.org/W6730169791","https://openalex.org/W6739693220","https://openalex.org/W6752199355","https://openalex.org/W6753505025","https://openalex.org/W6755283958","https://openalex.org/W6756009870","https://openalex.org/W6758283263","https://openalex.org/W6759814162","https://openalex.org/W6766182666"],"related_works":["https://openalex.org/W2377402383","https://openalex.org/W2380835401","https://openalex.org/W2381912691","https://openalex.org/W2350381577","https://openalex.org/W2353618196","https://openalex.org/W2348074676","https://openalex.org/W2385033175","https://openalex.org/W2374043190","https://openalex.org/W2363298784","https://openalex.org/W2367402697"],"abstract_inverted_index":{"Training":[0],"a":[1,122,175],"neural":[2],"network":[3,82,191],"requires":[4],"retraining":[5],"the":[6,14,19,37,72,86,93,99,129,153,159,162,168,172],"same":[7],"model":[8,75],"many":[9],"times":[10],"to":[11,26,95,107,114,127,139,152],"search":[12,97],"for":[13,98],"configuration":[15],"of":[16,39,77,167],"hyper-parameters":[17],"with":[18,43,51],"best":[20],"training":[21,29,61,115],"result.":[22],"It":[23],"is":[24,137],"common":[25],"launch":[27],"multiple":[28,63],"jobs":[30,42,50,79],"and":[31,49,65,91,146,183,197],"evaluate":[32],"them":[33],"in":[34,174],"stages.":[35,116],"At":[36],"completion":[38,88],"each":[40],"stage,":[41],"unpromising":[44],"configurations":[45,53],"will":[46,54],"be":[47],"terminated":[48],"new":[52],"start.":[55],"Each":[56],"job":[57],"typically":[58],"performs":[59],"distributed":[60],"across":[62],"GPUs,":[64],"GPUs":[66],"periodically":[67],"synchronize":[68],"their":[69],"models":[70],"over":[71],"network.":[73],"However,":[74],"synchronizations":[76],"running":[78],"cause":[80],"severe":[81],"congestion,":[83],"significantly":[84,188],"increasing":[85],"stage":[87,144],"time":[89,94],"(SCT)":[90],"thus":[92],"successfully":[96],"desired":[100],"configuration.":[101],"Existing":[102],"flow":[103],"schedulers":[104],"are":[105,112,149],"ineffective":[106],"reduce":[108],"SCT":[109,164],"since":[110],"they":[111],"agnostic":[113],"In":[117,132],"this":[118,133],"paper,":[119],"we":[120],"propose":[121],"stage-aware":[123],"coflow":[124],"scheduling":[125],"method":[126,160,173],"minimize":[128],"average":[130,163],"SCT.":[131],"method,":[134],"an":[135],"algorithm":[136],"designed":[138],"order":[140],"coflows":[141,148],"by":[142],"considering":[143],"information":[145],"then":[147],"scheduled":[150],"according":[151],"order.":[154],"Mathematical":[155],"analysis":[156],"shows":[157],"that":[158,186],"achieves":[161],"within":[165],"20/3":[166],"optimal.":[169],"We":[170],"implement":[171],"real":[176],"system":[177],"called":[178],"Beamer.":[179],"Extensive":[180],"testbed":[181],"experiments":[182],"simulations":[184],"show":[185],"Beamer":[187],"outperforms":[189],"advanced":[190],"designs,":[192],"such":[193],"as":[194],"Sincronia,":[195],"FIFO-LM,":[196],"per-flow":[198],"fair":[199],"sharing.":[200]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
