{"id":"https://openalex.org/W4409132214","doi":"https://doi.org/10.1109/hpec62836.2024.10938435","title":"Exploring the Trade-Off Between Repair Time and Reliability in Large Scale Cluster Computers: A Simulation-Based Approach","display_name":"Exploring the Trade-Off Between Repair Time and Reliability in Large Scale Cluster Computers: A Simulation-Based Approach","publication_year":2024,"publication_date":"2024-09-23","ids":{"openalex":"https://openalex.org/W4409132214","doi":"https://doi.org/10.1109/hpec62836.2024.10938435"},"language":"en","primary_location":{"id":"doi:10.1109/hpec62836.2024.10938435","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec62836.2024.10938435","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5116498862","display_name":"Leslie A. Horace","orcid":"https://orcid.org/0000-0001-9176-6447"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Leslie A. Horace","raw_affiliation_strings":["College of Computing, Georgia Institute of Technology,Atlanta,GA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"College of Computing, Georgia Institute of Technology,Atlanta,GA,USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102121113","display_name":"Craig S. Walker","orcid":null},"institutions":[{"id":"https://openalex.org/I208081647","display_name":"Coastal Carolina University","ror":"https://ror.org/01621q256","country_code":"US","type":"education","lineage":["https://openalex.org/I208081647"]},{"id":"https://openalex.org/I4210152127","display_name":"Conway School of Landscape Design","ror":"https://ror.org/04q7y8a54","country_code":"US","type":"education","lineage":["https://openalex.org/I4210152127"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Craig S. Walker","raw_affiliation_strings":["Coastal Carolina University,Department of Computing Sciences,Conway,SC,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Coastal Carolina University,Department of Computing Sciences,Conway,SC,USA","institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062187807","display_name":"William M. Jones","orcid":"https://orcid.org/0000-0002-3796-6330"},"institutions":[{"id":"https://openalex.org/I208081647","display_name":"Coastal Carolina University","ror":"https://ror.org/01621q256","country_code":"US","type":"education","lineage":["https://openalex.org/I208081647"]},{"id":"https://openalex.org/I4210152127","display_name":"Conway School of Landscape Design","ror":"https://ror.org/04q7y8a54","country_code":"US","type":"education","lineage":["https://openalex.org/I4210152127"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"William M. Jones","raw_affiliation_strings":["Coastal Carolina University,Department of Computing Sciences,Conway,SC,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Coastal Carolina University,Department of Computing Sciences,Conway,SC,USA","institution_ids":["https://openalex.org/I208081647","https://openalex.org/I4210152127"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056569157","display_name":"Nathan DeBardeleben","orcid":"https://orcid.org/0000-0002-5593-9205"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nathan A. DeBardeleben","raw_affiliation_strings":["Los Alamos National Laboratory,High Performance Computing Design,Los Alamos,NM,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Los Alamos National Laboratory,High Performance Computing Design,Los Alamos,NM,USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093314692","display_name":"Vivian E. Hafener","orcid":null},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vivian E. Hafener","raw_affiliation_strings":["Los Alamos National Laboratory,High Performance Computing Env.,Los Alamos,NM,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Los Alamos National Laboratory,High Performance Computing Env.,Los Alamos,NM,USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5093314693","display_name":"Steven T. Senator","orcid":null},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Steven T. Senator","raw_affiliation_strings":["Los Alamos National Laboratory,High Performance Computing Env.,Los Alamos,NM,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Los Alamos National Laboratory,High Performance Computing Env.,Los Alamos,NM,USA","institution_ids":["https://openalex.org/I1343871089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.43515858,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10974","display_name":"Advanced Queuing Theory Analysis","score":0.9847999811172485,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10974","display_name":"Advanced Queuing Theory Analysis","score":0.9847999811172485,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10551","display_name":"Scheduling and Optimization Algorithms","score":0.9747999906539917,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9520999789237976,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7351550459861755},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.7013880014419556},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.6342121958732605},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5607558488845825},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.4415791928768158},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4285748302936554},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.32793793082237244},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.21146178245544434},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10869532823562622}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7351550459861755},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.7013880014419556},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.6342121958732605},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5607558488845825},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.4415791928768158},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4285748302936554},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.32793793082237244},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.21146178245544434},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10869532823562622},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpec62836.2024.10938435","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec62836.2024.10938435","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7500863821","display_name":null,"funder_award_id":"89233218CNA000001","funder_id":"https://openalex.org/F4320338304","funder_display_name":"Los Alamos National Laboratory"}],"funders":[{"id":"https://openalex.org/F4320338304","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W1944778298","https://openalex.org/W1966829058","https://openalex.org/W1978082708","https://openalex.org/W2119541875","https://openalex.org/W2132397967","https://openalex.org/W2133046454","https://openalex.org/W2234482445","https://openalex.org/W2594536021","https://openalex.org/W2597144055","https://openalex.org/W2777672226","https://openalex.org/W2851021412","https://openalex.org/W4200160368","https://openalex.org/W4388893987","https://openalex.org/W6750567530","https://openalex.org/W6844925538"],"related_works":["https://openalex.org/W2033512842","https://openalex.org/W4233600955","https://openalex.org/W4322734194","https://openalex.org/W3116237489","https://openalex.org/W4404996554","https://openalex.org/W2913665393","https://openalex.org/W2369695847","https://openalex.org/W3005535424","https://openalex.org/W2994319598","https://openalex.org/W2047067935"],"abstract_inverted_index":{"As":[0],"the":[1,19,25,63,76,113,161],"size":[2],"of":[3,39,131,160],"high":[4,87],"performance":[5,59,80,88],"computing":[6,89],"(HPC)":[7,90],"computational":[8,91],"clusters":[9,92],"continues":[10],"to":[11,109,118,142,149],"increase":[12],"in":[13,32,85,125],"performance,":[14],"scale":[15],"and":[16,23,36,56,60,81,155,164],"component":[17],"count,":[18],"role":[20,31],"that":[21,136,145],"reliability":[22],"particularly":[24],"repair":[26,83],"time":[27],"plays":[28],"a":[29,46,158],"significant":[30],"system":[33,58,153],"specification,":[34],"procurement,":[35],"ultimate":[37],"operation":[38,156],"such":[40],"systems.":[41],"System":[42],"administrators":[43],"must":[44],"find":[45],"balance":[47],"among":[48,69],"competing":[49],"factors:":[50],"initial":[51],"capital":[52],"investment,":[53],"operational":[54,162],"costs":[55],"observed":[57],"utility":[61],"from":[62,97],"end":[64],"users'":[65],"perspectives":[66],"are":[67],"chief":[68],"them.":[70],"In":[71],"this":[72],"paper,":[73],"we":[74],"explore":[75],"tradeoff":[77],"between":[78],"reliability,":[79],"node":[82],"times":[84],"large-scale":[86,114],"using":[93],"real":[94],"historical":[95],"workloads":[96],"Los":[98],"Alamos":[99],"National":[100],"Laboratory":[101],"(LANL).":[102],"We":[103],"enhance":[104],"an":[105],"existing":[106],"cluster":[107],"simulator":[108],"more":[110],"quickly":[111],"perform":[112],"parameter":[115],"sweeps":[116],"necessary":[117],"obtain":[119],"meaningful":[120],"results":[121,134],"for":[122],"these":[123,137],"studies,":[124],"some":[126],"cases":[127],"by":[128],"several":[129],"orders":[130],"magnitude.":[132],"Our":[133],"show":[135],"simulations":[138],"can":[139,146],"be":[140,147],"parameterized":[141],"identify":[143],"trends":[144],"used":[148],"make":[150],"decisions":[151],"about":[152],"procurement":[154],"as":[157],"function":[159],"parameters":[163],"constraints.":[165]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
