{"id":"https://openalex.org/W2127934939","doi":"https://doi.org/10.1109/clustr.2009.5289185","title":"Cluster fault-tolerance: An experimental evaluation of checkpointing and MapReduce through simulation","display_name":"Cluster fault-tolerance: An experimental evaluation of checkpointing and MapReduce through simulation","publication_year":2009,"publication_date":"2009-01-01","ids":{"openalex":"https://openalex.org/W2127934939","doi":"https://doi.org/10.1109/clustr.2009.5289185","mag":"2127934939"},"language":"en","primary_location":{"id":"doi:10.1109/clustr.2009.5289185","is_oa":false,"landing_page_url":"https://doi.org/10.1109/clustr.2009.5289185","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2009 IEEE International Conference on Cluster Computing and Workshops","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089369901","display_name":"Thomas Bressoud","orcid":null},"institutions":[{"id":"https://openalex.org/I20577493","display_name":"Denison University","ror":"https://ror.org/05pqx1c24","country_code":"US","type":"education","lineage":["https://openalex.org/I20577493"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Thomas C. Bressoud","raw_affiliation_strings":["Department of Mathematics and Computer Science, Denison University, Granville, OH, USA","Department of Mathematics and Computer Science, Denison University, P.O. Box 810, Granville, Ohio, USA"],"affiliations":[{"raw_affiliation_string":"Department of Mathematics and Computer Science, Denison University, Granville, OH, USA","institution_ids":["https://openalex.org/I20577493"]},{"raw_affiliation_string":"Department of Mathematics and Computer Science, Denison University, P.O. Box 810, Granville, Ohio, USA","institution_ids":["https://openalex.org/I20577493"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068719520","display_name":"Michael A. Kozuch","orcid":"https://orcid.org/0009-0009-0939-3297"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael A. Kozuch","raw_affiliation_strings":["Intel Research Pittsburgh, Intel Corporation, Pittsburgh, PA, USA","Intel Research Pittsburgh, Intel Corporation, 4720 Forbes Avenue, Pittsburgh, Pennsylvania, USA"],"affiliations":[{"raw_affiliation_string":"Intel Research Pittsburgh, Intel Corporation, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I1343180700"]},{"raw_affiliation_string":"Intel Research Pittsburgh, Intel Corporation, 4720 Forbes Avenue, Pittsburgh, Pennsylvania, USA","institution_ids":["https://openalex.org/I1343180700"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5089369901"],"corresponding_institution_ids":["https://openalex.org/I20577493"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.18019239,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"39","issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10974","display_name":"Advanced Queuing Theory Analysis","score":0.9884999990463257,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8524273037910461},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.8226287364959717},{"id":"https://openalex.org/keywords/cluster","display_name":"Cluster (spacecraft)","score":0.6164684891700745},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5996836423873901},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5834828615188599},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5381587147712708},{"id":"https://openalex.org/keywords/computer-cluster","display_name":"Computer cluster","score":0.4542234539985657},{"id":"https://openalex.org/keywords/software-fault-tolerance","display_name":"Software fault tolerance","score":0.4327569305896759},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.4283345341682434},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3048721253871918}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8524273037910461},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.8226287364959717},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.6164684891700745},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5996836423873901},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5834828615188599},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5381587147712708},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.4542234539985657},{"id":"https://openalex.org/C50712370","wikidata":"https://www.wikidata.org/wiki/Q4269346","display_name":"Software fault tolerance","level":3,"score":0.4327569305896759},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.4283345341682434},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3048721253871918},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/clustr.2009.5289185","is_oa":false,"landing_page_url":"https://doi.org/10.1109/clustr.2009.5289185","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2009 IEEE International Conference on Cluster Computing and Workshops","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/1","score":0.47999998927116394,"display_name":"No poverty"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1537929875","https://openalex.org/W1538116128","https://openalex.org/W1558516248","https://openalex.org/W1591141258","https://openalex.org/W1911403303","https://openalex.org/W1963759980","https://openalex.org/W1999667809","https://openalex.org/W2004885674","https://openalex.org/W2007397415","https://openalex.org/W2033656974","https://openalex.org/W2037730929","https://openalex.org/W2043522772","https://openalex.org/W2089536264","https://openalex.org/W2100418159","https://openalex.org/W2100830825","https://openalex.org/W2100970777","https://openalex.org/W2107263349","https://openalex.org/W2113052966","https://openalex.org/W2114967206","https://openalex.org/W2118750181","https://openalex.org/W2121098551","https://openalex.org/W2126177294","https://openalex.org/W2138509363","https://openalex.org/W2144984544","https://openalex.org/W2160821994","https://openalex.org/W2171453084","https://openalex.org/W2173213060","https://openalex.org/W2337969425","https://openalex.org/W4232281662","https://openalex.org/W6631987640","https://openalex.org/W6632055538","https://openalex.org/W6635224253","https://openalex.org/W6675073755","https://openalex.org/W6678306422","https://openalex.org/W6680456895","https://openalex.org/W6681525158"],"related_works":["https://openalex.org/W2971479921","https://openalex.org/W2525033434","https://openalex.org/W4280533024","https://openalex.org/W2106348006","https://openalex.org/W3145923041","https://openalex.org/W2946906624","https://openalex.org/W841176518","https://openalex.org/W1978919910","https://openalex.org/W4300992253","https://openalex.org/W2157727563"],"abstract_inverted_index":{"Traditionally,":[0],"cluster":[1,147],"computing":[2],"has":[3],"employed":[4],"checkpointing":[5,112],"to":[6,77,120],"address":[7],"fault":[8,32,126,155],"tolerance.":[9],"Recently,":[10],"new":[11],"models":[12,109],"for":[13,53,70,153],"parallel":[14,111,143],"applications":[15],"have":[16],"grown":[17],"in":[18,85,94,145,148],"popularity":[19],"namely":[20],"MapReduce":[21,114],"and":[22,107,113,123],"Dryad,":[23],"with":[24,36,132],"runtime":[25],"systems":[26,54,72],"providing":[27],"their":[28,40],"own":[29],"re-execute":[30],"based":[31],"tolerance":[33,127,156],"mechanisms,":[34],"but":[35,64],"no":[37],"analysis":[38],"of":[39,48,55,110,129,135,141,151],"failure":[41,49,68],"characteristics.":[42],"Another":[43],"development":[44],"is":[45,73],"the":[46,65,78,104,125,133,137,149],"availability":[47],"data":[50,106],"spanning":[51],"years":[52],"significant":[56],"size":[57],"at":[58],"Los":[59],"Alamos":[60],"National":[61],"Labs":[62],"(LANL),":[63],"time":[66,140],"between":[67],"(TBF)":[69],"these":[71,88,130],"a":[74,98,142,146],"poor":[75],"fit":[76],"exponential":[79],"distribution":[80],"assumed":[81],"by":[82,103,108],"optimization":[83],"work":[84,93],"checkpointing,":[86],"bringing":[87],"results":[89],"into":[90],"question.":[91],"The":[92,116],"this":[95],"paper":[96],"describes":[97],"discrete":[99],"event":[100],"simulation":[101,117],"driven":[102],"LANL":[105],"tasks.":[115],"allows":[118],"us":[119],"then":[121],"evaluate":[122],"assess":[124],"characteristics":[128],"tasks":[131],"goal":[134],"minimizing":[136],"expected":[138],"running":[139],"program":[144],"presence":[150],"faults":[152],"both":[154],"models.":[157]},"counts_by_year":[{"year":2016,"cited_by_count":4},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":3},{"year":2013,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
