{"id":"https://openalex.org/W2060617594","doi":"https://doi.org/10.1145/2749246.2749249","title":"Towards Scalable Distributed Workload Manager with Monitoring-Based Weakly Consistent Resource Stealing","display_name":"Towards Scalable Distributed Workload Manager with Monitoring-Based Weakly Consistent Resource Stealing","publication_year":2015,"publication_date":"2015-06-08","ids":{"openalex":"https://openalex.org/W2060617594","doi":"https://doi.org/10.1145/2749246.2749249","mag":"2060617594"},"language":"en","primary_location":{"id":"doi:10.1145/2749246.2749249","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2749246.2749249","pdf_url":"http://dl.acm.org/ft_gateway.cfm?id=2749249&type=pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"http://dl.acm.org/ft_gateway.cfm?id=2749249&type=pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040414354","display_name":"Ke Wang","orcid":"https://orcid.org/0000-0002-8306-1663"},"institutions":[{"id":"https://openalex.org/I180949307","display_name":"Illinois Institute of Technology","ror":"https://ror.org/037t3ry66","country_code":"US","type":"education","lineage":["https://openalex.org/I180949307"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ke Wang","raw_affiliation_strings":["Illinois Institute of Technology, Chicago, IL, USA","[Illinois Institute of Technology, Chicago, IL, USA.]"],"affiliations":[{"raw_affiliation_string":"Illinois Institute of Technology, Chicago, IL, USA","institution_ids":["https://openalex.org/I180949307"]},{"raw_affiliation_string":"[Illinois Institute of Technology, Chicago, IL, USA.]","institution_ids":["https://openalex.org/I180949307"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060532041","display_name":"Xiaobing Zhou","orcid":"https://orcid.org/0000-0003-1983-0971"},"institutions":[{"id":"https://openalex.org/I4210106576","display_name":"Hortonworks (United States)","ror":"https://ror.org/01bctgj87","country_code":"US","type":"company","lineage":["https://openalex.org/I4210106576"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaobing Zhou","raw_affiliation_strings":["Hortonworks Inc., Palo Alto, CA, USA"],"affiliations":[{"raw_affiliation_string":"Hortonworks Inc., Palo Alto, CA, USA","institution_ids":["https://openalex.org/I4210106576"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109800827","display_name":"Kan Qiao","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kan Qiao","raw_affiliation_strings":["Google, Seattle, WA, USA","Google, Seattle, WA, USA#TAB#"],"affiliations":[{"raw_affiliation_string":"Google, Seattle, WA, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google, Seattle, WA, USA#TAB#","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078475557","display_name":"Michael Lang","orcid":"https://orcid.org/0000-0002-3498-6352"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Lang","raw_affiliation_strings":["Los Alamos National Laboratory, Los Alamos, NM, USA","Los Alamos National Laboratory,Los Alamos, NM, USA"],"affiliations":[{"raw_affiliation_string":"Los Alamos National Laboratory, Los Alamos, NM, USA","institution_ids":["https://openalex.org/I1343871089"]},{"raw_affiliation_string":"Los Alamos National Laboratory,Los Alamos, NM, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013587090","display_name":"Benjamin McClelland","orcid":null},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Benjamin McClelland","raw_affiliation_strings":["Intel, Hillsboro, OR, USA","[Intel, Hillsboro, OR, USA]"],"affiliations":[{"raw_affiliation_string":"Intel, Hillsboro, OR, USA","institution_ids":["https://openalex.org/I1343180700"]},{"raw_affiliation_string":"[Intel, Hillsboro, OR, USA]","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5030599558","display_name":"Ioan Raicu","orcid":"https://orcid.org/0000-0002-5477-439X"},"institutions":[{"id":"https://openalex.org/I180949307","display_name":"Illinois Institute of Technology","ror":"https://ror.org/037t3ry66","country_code":"US","type":"education","lineage":["https://openalex.org/I180949307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ioan Raicu","raw_affiliation_strings":["Illinois Institute of Technology, Chicago, IL, USA","[Illinois Institute of Technology, Chicago, IL, USA.]"],"affiliations":[{"raw_affiliation_string":"Illinois Institute of Technology, Chicago, IL, USA","institution_ids":["https://openalex.org/I180949307"]},{"raw_affiliation_string":"[Illinois Institute of Technology, Chicago, IL, USA.]","institution_ids":["https://openalex.org/I180949307"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5040414354"],"corresponding_institution_ids":["https://openalex.org/I180949307"],"apc_list":null,"apc_paid":null,"fwci":14.0147,"has_fulltext":true,"cited_by_count":45,"citation_normalized_percentile":{"value":0.98940155,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"219","last_page":"222"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8211961984634399},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.784031093120575},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7793962955474854},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.7557045817375183},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6593860387802124},{"id":"https://openalex.org/keywords/job-scheduler","display_name":"Job scheduler","score":0.6565341949462891},{"id":"https://openalex.org/keywords/partition","display_name":"Partition (number theory)","score":0.5918487906455994},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.539943516254425},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.48744848370552063},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.4653027653694153},{"id":"https://openalex.org/keywords/resource-management","display_name":"Resource management (computing)","score":0.46214473247528076},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.45000696182250977},{"id":"https://openalex.org/keywords/processor-scheduling","display_name":"Processor scheduling","score":0.41013622283935547},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.257160484790802},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.19305464625358582},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.16344556212425232}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8211961984634399},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.784031093120575},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7793962955474854},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.7557045817375183},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6593860387802124},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.6565341949462891},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.5918487906455994},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.539943516254425},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.48744848370552063},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.4653027653694153},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.46214473247528076},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.45000696182250977},{"id":"https://openalex.org/C2984822820","wikidata":"https://www.wikidata.org/wiki/Q1123036","display_name":"Processor scheduling","level":3,"score":0.41013622283935547},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.257160484790802},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.19305464625358582},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.16344556212425232},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2749246.2749249","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2749246.2749249","pdf_url":"http://dl.acm.org/ft_gateway.cfm?id=2749249&type=pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/2749246.2749249","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2749246.2749249","pdf_url":"http://dl.acm.org/ft_gateway.cfm?id=2749249&type=pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth","score":0.6299999952316284}],"awards":[{"id":"https://openalex.org/G2340147074","display_name":null,"funder_award_id":"CNS-1042543","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5657628048","display_name":null,"funder_award_id":"AC52-06NA25396","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8496933687","display_name":"Collaborative Research: PRObE - The NSF Parallel Reconfigurable Observational Environment for Data Intensive Super-Computing and High End Computing","funder_award_id":"1042543","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2060617594.pdf","grobid_xml":"https://content.openalex.org/works/W2060617594.grobid-xml"},"referenced_works_count":22,"referenced_works":["https://openalex.org/W114584783","https://openalex.org/W1565868544","https://openalex.org/W1580503671","https://openalex.org/W1589918049","https://openalex.org/W1596936080","https://openalex.org/W1603731243","https://openalex.org/W2008559335","https://openalex.org/W2020653682","https://openalex.org/W2021876388","https://openalex.org/W2076449099","https://openalex.org/W2079031056","https://openalex.org/W2091327926","https://openalex.org/W2103803358","https://openalex.org/W2108841814","https://openalex.org/W2123134606","https://openalex.org/W2135377936","https://openalex.org/W2146381930","https://openalex.org/W2168067900","https://openalex.org/W2184628147","https://openalex.org/W2289254904","https://openalex.org/W2594899909","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2923452570","https://openalex.org/W206598027","https://openalex.org/W2978610750","https://openalex.org/W2022931285","https://openalex.org/W1589966275","https://openalex.org/W2086872282","https://openalex.org/W2137789903","https://openalex.org/W2534160330","https://openalex.org/W2153007255","https://openalex.org/W2987590351"],"abstract_inverted_index":{"One":[0],"way":[1],"to":[2,10,50,99,144],"efficiently":[3],"utilize":[4],"the":[5,24,28,60,110,141,147],"coming":[6],"exascale":[7],"machines":[8],"is":[9],"support":[11],"a":[12,53,73,92],"mixture":[13],"of":[14,45,75],"applications":[15],"in":[16,36,80,103,112,132,154],"various":[17],"domains,":[18],"such":[19],"as":[20,146],"traditional":[21],"large-scale":[22],"HPC,":[23],"ensemble":[25],"runs,":[26],"and":[27,40,78,108,135,150],"fine-grained":[29],"many-task":[30],"computing":[31,157],"(MTC).":[32],"Delivering":[33],"high":[34],"performance":[35,142],"resource":[37,81,84,96,101],"allocation,":[38],"scheduling":[39],"launching":[41,136],"for":[42],"all":[43],"types":[44],"jobs":[46,137],"has":[47],"driven":[48],"us":[49],"develop":[51],"Slurm++,":[52],"distributed":[54,104],"workload":[55],"manager":[56],"directly":[57],"extended":[58],"from":[59],"Slurm":[61,118,131],"centralized":[62],"production":[63],"system.":[64],"Slurm++":[65,116,126],"employs":[66],"multiple":[67],"controllers":[68],"with":[69,117,122],"each":[70],"one":[71],"managing":[72],"partition":[74],"compute":[76],"nodes":[77],"participating":[79],"allocation":[82],"through":[83],"balancing":[85,102],"techniques.":[86],"In":[87],"this":[88],"paper,":[89],"we":[90,139],"propose":[91],"monitoring-based":[93],"weakly":[94],"consistent":[95],"stealing":[97],"technique":[98,111],"achieve":[100],"HPC":[105],"job":[106,124,148],"launch,":[107],"implement":[109],"Slurm++.":[113],"We":[114],"compare":[115],"using":[119],"micro-benchmark":[120],"workloads":[121],"different":[123],"sizes.":[125],"showed":[127],"10X":[128],"faster":[129],"than":[130],"allocating":[133],"resources":[134],"--":[138],"expect":[140],"gap":[143],"grow":[145],"sizes":[149],"system":[151],"scales":[152],"increase":[153],"future":[155],"high-end":[156],"systems.":[158]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2022,"cited_by_count":4},{"year":2018,"cited_by_count":3},{"year":2017,"cited_by_count":3},{"year":2016,"cited_by_count":13},{"year":2015,"cited_by_count":21}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
