{"id":"https://openalex.org/W2898407160","doi":"https://doi.org/10.1109/asonam.2018.8508245","title":"Introducing a Reliability Analysis Framework for High Performance Computing Environments","display_name":"Introducing a Reliability Analysis Framework for High Performance Computing Environments","publication_year":2018,"publication_date":"2018-08-01","ids":{"openalex":"https://openalex.org/W2898407160","doi":"https://doi.org/10.1109/asonam.2018.8508245","mag":"2898407160"},"language":"en","primary_location":{"id":"doi:10.1109/asonam.2018.8508245","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asonam.2018.8508245","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000127560","display_name":"Shivani Sharma","orcid":"https://orcid.org/0000-0003-3381-269X"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"S. Sharma","raw_affiliation_strings":["Whiting School of Engineering, Johns Hopkins University, Baltimore, Maryland"],"affiliations":[{"raw_affiliation_string":"Whiting School of Engineering, Johns Hopkins University, Baltimore, Maryland","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024997350","display_name":"A. D. Clark","orcid":null},"institutions":[{"id":"https://openalex.org/I12097938","display_name":"West Virginia University","ror":"https://ror.org/011vxgd24","country_code":"US","type":"education","lineage":["https://openalex.org/I12097938"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"A. D. Clark","raw_affiliation_strings":["Lane Department of Computer Science and Electrical Engineering, West Virginia University, Morgantown, West Virginia"],"affiliations":[{"raw_affiliation_string":"Lane Department of Computer Science and Electrical Engineering, West Virginia University, Morgantown, West Virginia","institution_ids":["https://openalex.org/I12097938"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5000127560"],"corresponding_institution_ids":["https://openalex.org/I145311948"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.14314308,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":"67","issue":null,"first_page":"1131","last_page":"1138"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7830967903137207},{"id":"https://openalex.org/keywords/troubleshooting","display_name":"Troubleshooting","score":0.7003649473190308},{"id":"https://openalex.org/keywords/mean-time-between-failures","display_name":"Mean time between failures","score":0.579458475112915},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5604472160339355},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.5445104837417603},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4546542167663574},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.42081180214881897},{"id":"https://openalex.org/keywords/failure-rate","display_name":"Failure rate","score":0.33731013536453247},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.22062358260154724},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.12043046951293945},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.10507053136825562}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7830967903137207},{"id":"https://openalex.org/C147494362","wikidata":"https://www.wikidata.org/wiki/Q2078905","display_name":"Troubleshooting","level":2,"score":0.7003649473190308},{"id":"https://openalex.org/C44154001","wikidata":"https://www.wikidata.org/wiki/Q754940","display_name":"Mean time between failures","level":3,"score":0.579458475112915},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5604472160339355},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.5445104837417603},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4546542167663574},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.42081180214881897},{"id":"https://openalex.org/C163164238","wikidata":"https://www.wikidata.org/wiki/Q2737027","display_name":"Failure rate","level":2,"score":0.33731013536453247},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.22062358260154724},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.12043046951293945},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.10507053136825562},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asonam.2018.8508245","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asonam.2018.8508245","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining (ASONAM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.6299999952316284,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W1999667809","https://openalex.org/W2057218453","https://openalex.org/W2089536264","https://openalex.org/W2126637194","https://openalex.org/W2138128578","https://openalex.org/W2138509363","https://openalex.org/W2142812297","https://openalex.org/W2144984544","https://openalex.org/W2251285835","https://openalex.org/W2784123256","https://openalex.org/W4254367612","https://openalex.org/W6680456895","https://openalex.org/W6681525158"],"related_works":["https://openalex.org/W2183751629","https://openalex.org/W2168011386","https://openalex.org/W2122592404","https://openalex.org/W4244614293","https://openalex.org/W2315243270","https://openalex.org/W2337334590","https://openalex.org/W1030923862","https://openalex.org/W2295465696","https://openalex.org/W2248481300","https://openalex.org/W3029118220"],"abstract_inverted_index":{"Supercomputing":[0],"environments":[1,50],"are":[2,54,112],"becoming":[3],"the":[4,52,73,106,132,136,150,166,171,180,190,207,213,236],"norm":[5],"for":[6,45,90],"daily":[7],"use.":[8],"However,":[9],"their":[10],"complex":[11],"infrastructure":[12],"makes":[13],"troubleshooting":[14,224],"and":[15,32,80,84,92,124,200,223,241,247],"monitoring":[16],"failures":[17],"extremely":[18],"difficult.":[19],"This":[20,70,95,154],"is":[21,64,108,128,155,187],"because":[22],"these":[23,36],"infrastructures":[24],"contain":[25],"thousands":[26],"of":[27,114,197,215,220,229],"nodes":[28],"representing":[29],"various":[30],"applications":[31],"processors.":[33],"To":[34],"address":[35],"concerns,":[37],"we":[38],"propose":[39],"a":[40,67,121,158,184],"real-time":[41],"reliability":[42,122,152],"analysis":[43,125,186,205],"framework":[44],"high":[46],"performance":[47],"computing":[48],"(HPC)":[49],"where":[51,105],"contributions":[53],"three-fold.":[55],"First,":[56],"an":[57],"improved":[58],"data":[59],"network":[60],"extrapolation":[61],"(DNE)":[62],"methodology":[63],"proposed":[65,129],"as":[66,170,177,179],"pre-processing":[68],"module.":[69],"component":[71],"incorporates":[72],"system":[74],"failure":[75,88,103,133,159,167,175,202,225,243],"information":[76,134],"(i.e.":[77],"job,":[78],"fault,":[79],"error":[81],"log":[82],"files)":[83],"performs":[85,98],"robust":[86],"job-based":[87],"accounting":[89],"sequential":[91],"parallel":[93],"jobs.":[94,119],"element":[96],"also":[97],"cross-referencing":[99],"to":[100,139,149,174,239],"compute":[101],"task-based":[102,201],"accounting,":[104],"assumption":[107],"made":[109,188],"that":[110,130,164],"tasks":[111],"comprised":[113],"either":[115],"one":[116],"or":[117,192],"more":[118],"Next,":[120],"characterization":[123,161],"(RCA)":[126],"schema":[127,163],"takes":[131],"from":[135],"DNE":[137],"process":[138],"perform":[140],"survival":[141],"analyses":[142],"on":[143],"each":[144],"individual":[145],"node":[146],"in":[147,195,218],"addition":[148],"entire":[151],"infrastructure.":[153],"coupled":[156],"with":[157],"metrics":[160,168],"(FMC)":[162],"estimates":[165],"such":[169],"mean":[172],"time":[173],"(MTTF)":[176],"well":[178],"hazard":[181],"rate.":[182],"Additionally,":[183],"comparative":[185],"between":[189],"Log-Normal":[191],"Weibull":[193],"distributions":[194],"terms":[196,219],"modeling":[198],"job":[199],"activity.":[203],"Empirical":[204],"using":[206],"Structural":[208],"Simulation":[209],"Toolkit":[210],"(SST)":[211],"illustrate":[212],"promise":[214],"this":[216,230],"approach":[217],"characterizing,":[221],"monitoring,":[222],"behavior.":[226],"The":[227],"results":[228],"work":[231],"can":[232],"aide":[233],"systems":[234],"administrators":[235],"dynamic":[237],"tools":[238],"pinpoint":[240],"monitor":[242],"behavior;":[244],"its":[245],"impacts;":[246],"alternative":[248],"job-scheduling":[249],"policies":[250],"without":[251],"interrupting":[252],"production":[253],"processes.":[254]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
