{"id":"https://openalex.org/W4200160368","doi":"https://doi.org/10.1109/hpec49654.2021.9622853","title":"Exploring the Tradeoff Between Reliability and Performance in HPC Systems","display_name":"Exploring the Tradeoff Between Reliability and Performance in HPC Systems","publication_year":2021,"publication_date":"2021-09-20","ids":{"openalex":"https://openalex.org/W4200160368","doi":"https://doi.org/10.1109/hpec49654.2021.9622853"},"language":"en","primary_location":{"id":"doi:10.1109/hpec49654.2021.9622853","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec49654.2021.9622853","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041160480","display_name":"Craig Walker","orcid":null},"institutions":[{"id":"https://openalex.org/I4210152127","display_name":"Conway School of Landscape Design","ror":"https://ror.org/04q7y8a54","country_code":"US","type":"education","lineage":["https://openalex.org/I4210152127"]},{"id":"https://openalex.org/I208081647","display_name":"Coastal Carolina University","ror":"https://ror.org/01621q256","country_code":"US","type":"education","lineage":["https://openalex.org/I208081647"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Craig Walker","raw_affiliation_strings":["Coastal Carolina University, Conway, SC, USA"],"affiliations":[{"raw_affiliation_string":"Coastal Carolina University, Conway, SC, USA","institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035265406","display_name":"Braeden Slade","orcid":null},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Braeden Slade","raw_affiliation_strings":["Ultrascale Systems Research Center, Los Alamos National Laboratory, Los Alamos, NM, USA"],"affiliations":[{"raw_affiliation_string":"Ultrascale Systems Research Center, Los Alamos National Laboratory, Los Alamos, NM, USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064216355","display_name":"Gavin Bailey","orcid":"https://orcid.org/0000-0002-7361-9539"},"institutions":[{"id":"https://openalex.org/I4210152127","display_name":"Conway School of Landscape Design","ror":"https://ror.org/04q7y8a54","country_code":"US","type":"education","lineage":["https://openalex.org/I4210152127"]},{"id":"https://openalex.org/I208081647","display_name":"Coastal Carolina University","ror":"https://ror.org/01621q256","country_code":"US","type":"education","lineage":["https://openalex.org/I208081647"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gavin Bailey","raw_affiliation_strings":["Coastal Carolina University, Conway, SC, USA"],"affiliations":[{"raw_affiliation_string":"Coastal Carolina University, Conway, SC, USA","institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042837253","display_name":"Nicklaus Przybylski","orcid":null},"institutions":[{"id":"https://openalex.org/I208081647","display_name":"Coastal Carolina University","ror":"https://ror.org/01621q256","country_code":"US","type":"education","lineage":["https://openalex.org/I208081647"]},{"id":"https://openalex.org/I4210152127","display_name":"Conway School of Landscape Design","ror":"https://ror.org/04q7y8a54","country_code":"US","type":"education","lineage":["https://openalex.org/I4210152127"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nicklaus Przybylski","raw_affiliation_strings":["Coastal Carolina University, Conway, SC, USA"],"affiliations":[{"raw_affiliation_string":"Coastal Carolina University, Conway, SC, USA","institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056569157","display_name":"Nathan DeBardeleben","orcid":"https://orcid.org/0000-0002-5593-9205"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nathan DeBardeleben","raw_affiliation_strings":["Ultrascale Systems Research Center, Los Alamos National Laboratory, Los Alamos, NM, USA"],"affiliations":[{"raw_affiliation_string":"Ultrascale Systems Research Center, Los Alamos National Laboratory, Los Alamos, NM, USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062187807","display_name":"William M. Jones","orcid":"https://orcid.org/0000-0002-3796-6330"},"institutions":[{"id":"https://openalex.org/I4210152127","display_name":"Conway School of Landscape Design","ror":"https://ror.org/04q7y8a54","country_code":"US","type":"education","lineage":["https://openalex.org/I4210152127"]},{"id":"https://openalex.org/I208081647","display_name":"Coastal Carolina University","ror":"https://ror.org/01621q256","country_code":"US","type":"education","lineage":["https://openalex.org/I208081647"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"William M. Jones","raw_affiliation_strings":["Coastal Carolina University, Conway, SC, USA"],"affiliations":[{"raw_affiliation_string":"Coastal Carolina University, Conway, SC, USA","institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5041160480"],"corresponding_institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"],"apc_list":null,"apc_paid":null,"fwci":0.9169,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.7689678,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7880739569664001},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.7167521119117737},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.6630319356918335},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6275694370269775},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.5748714804649353},{"id":"https://openalex.org/keywords/grid","display_name":"Grid","score":0.5534235239028931},{"id":"https://openalex.org/keywords/grid-computing","display_name":"Grid computing","score":0.5528360605239868},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5488319396972656},{"id":"https://openalex.org/keywords/procurement","display_name":"Procurement","score":0.5348618626594543},{"id":"https://openalex.org/keywords/job-scheduler","display_name":"Job scheduler","score":0.5143831372261047},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4846135079860687},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.44648388028144836},{"id":"https://openalex.org/keywords/job-shop-scheduling","display_name":"Job shop scheduling","score":0.42051151394844055},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.2429208755493164},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.22069096565246582},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1653364598751068},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.13412103056907654}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7880739569664001},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.7167521119117737},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.6630319356918335},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6275694370269775},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.5748714804649353},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.5534235239028931},{"id":"https://openalex.org/C70429105","wikidata":"https://www.wikidata.org/wiki/Q249999","display_name":"Grid computing","level":3,"score":0.5528360605239868},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5488319396972656},{"id":"https://openalex.org/C201650216","wikidata":"https://www.wikidata.org/wiki/Q829492","display_name":"Procurement","level":2,"score":0.5348618626594543},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.5143831372261047},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4846135079860687},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.44648388028144836},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.42051151394844055},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2429208755493164},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.22069096565246582},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1653364598751068},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.13412103056907654},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C162853370","wikidata":"https://www.wikidata.org/wiki/Q39809","display_name":"Marketing","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpec49654.2021.9622853","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec49654.2021.9622853","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/8","display_name":"Decent work and economic growth","score":0.46000000834465027}],"awards":[],"funders":[{"id":"https://openalex.org/F4320338304","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1484736291","https://openalex.org/W1524327605","https://openalex.org/W1596277129","https://openalex.org/W1966829058","https://openalex.org/W1978082708","https://openalex.org/W2119541875","https://openalex.org/W2127433432","https://openalex.org/W2132397967","https://openalex.org/W2147598193","https://openalex.org/W2234482445","https://openalex.org/W2594536021","https://openalex.org/W2597144055","https://openalex.org/W2777672226","https://openalex.org/W2795980953","https://openalex.org/W2851021412","https://openalex.org/W2886481383","https://openalex.org/W6750567530","https://openalex.org/W6753359477"],"related_works":["https://openalex.org/W4319941049","https://openalex.org/W2092966558","https://openalex.org/W1956651153","https://openalex.org/W1984267569","https://openalex.org/W2347561926","https://openalex.org/W2315766899","https://openalex.org/W2327547880","https://openalex.org/W2418418119","https://openalex.org/W3153517983","https://openalex.org/W210339300"],"abstract_inverted_index":{"Evaluating":[0],"the":[1,41,94,154,159,162,168,204,224],"trade-off":[2],"space":[3],"between":[4],"performance":[5],"and":[6,21,54,61,80,84,105,114,127,151,214],"reliability":[7],"is":[8,72,89,146,165,215,227],"important":[9],"for":[10,148],"data":[11],"center":[12],"operators":[13],"as":[14,46],"part":[15],"of":[16,93,139,161,223],"their":[17],"supercomputer":[18],"procurement,":[19],"planning":[20],"acceptance":[22],"testing.":[23],"While":[24],"some":[25],"simple":[26],"systems":[27,135],"can":[28,208],"be":[29,209],"modeled":[30],"with":[31,39,136,153,230],"tractable":[32],"analytic":[33],"methods,":[34],"in":[35,200,218],"order":[36],"to":[37,65,129,157,172,198,202,211],"capture":[38],"fidelity":[40],"interaction":[42],"among":[43],"such":[44],"factors":[45],"individual":[47],"component":[48],"reliability,":[49],"processor":[50],"speeds,":[51],"checkpointing":[52],"behaviors":[53],"effects,":[55],"workload":[56,180,234],"characteristics":[57],"(capacity":[58],"versus":[59],"capability),":[60],"scheduling":[62,83],"policies,":[63],"just":[64],"name":[66],"a":[67,69,78,102,179,184,231],"few,":[68],"simulation-based":[70],"approach":[71],"required.":[73],"This":[74,207],"paper":[75],"extends":[76],"Batsim,":[77],"flexible":[79],"modular":[81],"batch":[82],"cluster":[85],"simulation":[86,96,225],"framework":[87],"that":[88,167,194,235],"built":[90],"on":[91,122,132,183],"top":[92],"grid":[95],"framework,":[97],"SimGrid.":[98],"These":[99],"extensions":[100],"add":[101],"fault":[103],"model":[104,145],"simulated":[106],"job":[107],"checkpoint-restart":[108],"capability.":[109],"The":[110,221],"enhancements":[111],"are":[112,116],"detailed":[113],"experiments":[115],"performed":[117],"using":[118],"this":[119,149],"new":[120],"capability":[121],"varying":[123,137],"workloads,":[124],"both":[125],"synthetic":[126],"real,":[128],"evaluate":[130,212],"impacts":[131],"prospective":[133,185],"HPC":[134],"levels":[138],"per-node":[140],"reliability.":[141],"A":[142],"basic":[143],"analytical":[144],"constructed":[147],"trade-off,":[150],"contrasted":[152],"experimental":[155],"results":[156],"illustrate":[158],"utility":[160],"simulator.":[163],"It":[164],"shown":[166],"toolkit":[169],"enables":[170],"one":[171],"see":[173],"not":[174],"only":[175],"how":[176,191],"much":[177,192],"worse":[178],"will":[181],"perform":[182],"less":[186],"reliable":[187],"system,":[188],"but":[189],"also":[190],"larger":[193],"system":[195,219],"would":[196],"have":[197],"increase":[199],"size":[201],"achieve":[203],"same":[205],"makespan.":[206],"used":[210],"tradeoffs":[213],"vitally":[216],"useful":[217],"procurement.":[220],"value":[222],"environment":[226],"further":[228],"demonstrated":[229],"complex,":[232],"real-world":[233],"performs":[236],"differently":[237],"than":[238],"expected":[239],"analytically.":[240]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
