{"id":"https://openalex.org/W1984980195","doi":"https://doi.org/10.1109/grid.2010.5697961","title":"Analysis and modeling of time-correlated failures in large-scale distributed systems","display_name":"Analysis and modeling of time-correlated failures in large-scale distributed systems","publication_year":2010,"publication_date":"2010-10-01","ids":{"openalex":"https://openalex.org/W1984980195","doi":"https://doi.org/10.1109/grid.2010.5697961","mag":"1984980195"},"language":"en","primary_location":{"id":"doi:10.1109/grid.2010.5697961","is_oa":false,"landing_page_url":"https://doi.org/10.1109/grid.2010.5697961","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2010 11th IEEE/ACM International Conference on Grid Computing","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://pure.tue.nl/ws/files/3736277/Metis255876.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031850480","display_name":"Nezih Yigitbasi","orcid":null},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Nezih Yigitbasi","raw_affiliation_strings":["Delft University of Technnology, Netherlands","TU Delft - Delft University of Technology (Postbus 5,  2600 AA Delft - Netherlands)"],"affiliations":[{"raw_affiliation_string":"Delft University of Technnology, Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"TU Delft - Delft University of Technology (Postbus 5,  2600 AA Delft - Netherlands)","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079805544","display_name":"Matthieu Gallet","orcid":null},"institutions":[{"id":"https://openalex.org/I113428412","display_name":"\u00c9cole Normale Sup\u00e9rieure de Lyon","ror":"https://ror.org/04zmssz18","country_code":"FR","type":"education","lineage":["https://openalex.org/I113428412","https://openalex.org/I203339264"]},{"id":"https://openalex.org/I4210144566","display_name":"Laboratoire de l'Informatique du Parall\u00e9lisme","ror":"https://ror.org/04msnz457","country_code":"FR","type":"facility","lineage":["https://openalex.org/I100532134","https://openalex.org/I113428412","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I203339264","https://openalex.org/I203339264","https://openalex.org/I4210144566"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Matthieu Gallet","raw_affiliation_strings":["Ecole Normale Superieure, France","LIP - Laboratoire de l'Informatique du Parall\u00e9lisme (46 All\u00e9e d'Italie 69364 LYON CEDEX 07 - France)","GRAAL - Algorithms and Scheduling for Distributed Heterogeneous Platforms (ENS Lyon, 46 all\u00e9e d'Italie, 69364 LYON cedex 07 France - France)"],"affiliations":[{"raw_affiliation_string":"Ecole Normale Superieure, France","institution_ids":[]},{"raw_affiliation_string":"LIP - Laboratoire de l'Informatique du Parall\u00e9lisme (46 All\u00e9e d'Italie 69364 LYON CEDEX 07 - France)","institution_ids":["https://openalex.org/I4210144566","https://openalex.org/I113428412"]},{"raw_affiliation_string":"GRAAL - Algorithms and Scheduling for Distributed Heterogeneous Platforms (ENS Lyon, 46 all\u00e9e d'Italie, 69364 LYON cedex 07 France - France)","institution_ids":["https://openalex.org/I113428412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087185379","display_name":"Derrick Kondo","orcid":null},"institutions":[{"id":"https://openalex.org/I1326498283","display_name":"Institut national de recherche en sciences et technologies du num\u00e9rique","ror":"https://ror.org/02kvxyf05","country_code":"FR","type":"government","lineage":["https://openalex.org/I1326498283"]},{"id":"https://openalex.org/I4210101348","display_name":"Centre Inria de l'Universit\u00e9 Grenoble Alpes","ror":"https://ror.org/00n8d6z93","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1326498283","https://openalex.org/I4210101348"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Derrick Kondo","raw_affiliation_strings":["INRIA Grenoble, France","MESCAL - Middleware efficiently scalable (Inria Grenoble - Rh\u00f4ne-Alpes 655 avenue de l'Europe - Montbonnot 38334 Saint Ismier Cedex - France)"],"affiliations":[{"raw_affiliation_string":"INRIA Grenoble, France","institution_ids":["https://openalex.org/I1326498283"]},{"raw_affiliation_string":"MESCAL - Middleware efficiently scalable (Inria Grenoble - Rh\u00f4ne-Alpes 655 avenue de l'Europe - Montbonnot 38334 Saint Ismier Cedex - France)","institution_ids":["https://openalex.org/I4210101348"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006986556","display_name":"Alexandru Iosup","orcid":"https://orcid.org/0000-0001-8030-9398"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Alexandru Iosup","raw_affiliation_strings":["Delft University of Technnology, Netherlands","PDS - Parallel and Distributed Group (TU Delft, Room HB 09.090 Mekelweg 4, 2628 CD, Delft The Netherlands - Netherlands)","TU Delft - Delft University of Technology (Postbus 5,  2600 AA Delft - Netherlands)"],"affiliations":[{"raw_affiliation_string":"Delft University of Technnology, Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"PDS - Parallel and Distributed Group (TU Delft, Room HB 09.090 Mekelweg 4, 2628 CD, Delft The Netherlands - Netherlands)","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"TU Delft - Delft University of Technology (Postbus 5,  2600 AA Delft - Netherlands)","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049227990","display_name":"Dick Epema","orcid":"https://orcid.org/0000-0002-1015-0075"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Dick Epema","raw_affiliation_strings":["Delft University of Technnology, Netherlands","PDS - Parallel and Distributed Group (TU Delft, Room HB 09.090 Mekelweg 4, 2628 CD, Delft The Netherlands - Netherlands)"],"affiliations":[{"raw_affiliation_string":"Delft University of Technnology, Netherlands","institution_ids":["https://openalex.org/I98358874"]},{"raw_affiliation_string":"PDS - Parallel and Distributed Group (TU Delft, Room HB 09.090 Mekelweg 4, 2628 CD, Delft The Netherlands - Netherlands)","institution_ids":["https://openalex.org/I98358874"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5031850480"],"corresponding_institution_ids":["https://openalex.org/I98358874"],"apc_list":null,"apc_paid":null,"fwci":6.0279,"has_fulltext":true,"cited_by_count":65,"citation_normalized_percentile":{"value":0.96399729,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"65","last_page":"72"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.655515193939209},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5786118507385254},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4905921518802643},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.0631178617477417}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.655515193939209},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5786118507385254},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4905921518802643},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0631178617477417},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.0}],"mesh":[],"locations_count":6,"locations":[{"id":"doi:10.1109/grid.2010.5697961","is_oa":false,"landing_page_url":"https://doi.org/10.1109/grid.2010.5697961","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2010 11th IEEE/ACM International Conference on Grid Computing","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.tue.nl:publications/8d3afbfa-0170-4802-9e54-508d81059469","is_oa":true,"landing_page_url":"https://research.tue.nl/en/publications/8d3afbfa-0170-4802-9e54-508d81059469","pdf_url":"https://pure.tue.nl/ws/files/3736277/Metis255876.pdf","source":{"id":"https://openalex.org/S4406922641","display_name":"TU/e Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yigitbasi, M N, Gallet, M, Kondo, D, Iosup, A & Epema, D H J 2010, Analysis and modeling of time-correlated failures in large-scale distributed systems. in Proceedings of the 11th IEEE/ACM International Conference on Grid Computing (GRID 2010, Brussels, Belgium, October 25-28, 2011). Institute of Electrical and Electronics Engineers, pp. 65-72. https://doi.org/10.1109/GRID.2010.5697961","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:HAL:hal-00786266v1","is_oa":false,"landing_page_url":"https://inria.hal.science/hal-00786266","pdf_url":null,"source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Grid 2010 - Proceedings of the 11th ACM/IEEE International Conference on Grid Computing, 2010, Bruxelles, Belgium. pp.355-366, &#x27E8;10.1109/GRID.2010.5697961&#x27E9;","raw_type":"Conference papers"},{"id":"pmh:oai:research.vu.nl:publications/2591ee13-edd4-4287-83c8-acbca4489476","is_oa":false,"landing_page_url":"https://hdl.handle.net/1871.1/2591ee13-edd4-4287-83c8-acbca4489476","pdf_url":null,"source":{"id":"https://openalex.org/S4306401107","display_name":"VU Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I865915315","host_organization_name":"Vrije Universiteit Amsterdam","host_organization_lineage":["https://openalex.org/I865915315"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yigitbasi, N, Gallet, M, Kondo, D, Iosup, A & Epema, D H J 2010, Analysis and modeling of time-correlated failures in large-scale distributed systems. in Proceedings of the 2010 11th IEEE/ACM International Conference on Grid Computing, Brussels, Belgium, October 25-29, 2010. pp. 65-72. https://doi.org/10.1109/GRID.2010.5697961","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:tue:oai:pure.tue.nl:publications/8d3afbfa-0170-4802-9e54-508d81059469","is_oa":true,"landing_page_url":"https://research.tue.nl/nl/publications/8d3afbfa-0170-4802-9e54-508d81059469","pdf_url":null,"source":{"id":"https://openalex.org/S4306401843","display_name":"Data Archiving and Networked Services (DANS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1322597698","host_organization_name":"Royal Netherlands Academy of Arts and Sciences","host_organization_lineage":["https://openalex.org/I1322597698"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proceedings of the 11th IEEE/ACM International Conference on Grid Computing (GRID 2010, Brussels, Belgium, October 25-28, 2011), 65 - 72","raw_type":"info:eu-repo/semantics/conferencepaper"},{"id":"pmh:vu:oai:research.vu.nl:publications/2591ee13-edd4-4287-83c8-acbca4489476","is_oa":false,"landing_page_url":"https://research.vu.nl/en/publications/2591ee13-edd4-4287-83c8-acbca4489476","pdf_url":null,"source":{"id":"https://openalex.org/S4306401843","display_name":"Data Archiving and Networked Services (DANS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1322597698","host_organization_name":"Royal Netherlands Academy of Arts and Sciences","host_organization_lineage":["https://openalex.org/I1322597698"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proceedings of the 2010 11th IEEE/ACM International Conference on Grid Computing, Brussels, Belgium, October 25-29, 2010, 65 - 72","raw_type":"info:eu-repo/semantics/conferencepaper"}],"best_oa_location":{"id":"pmh:oai:pure.tue.nl:publications/8d3afbfa-0170-4802-9e54-508d81059469","is_oa":true,"landing_page_url":"https://research.tue.nl/en/publications/8d3afbfa-0170-4802-9e54-508d81059469","pdf_url":"https://pure.tue.nl/ws/files/3736277/Metis255876.pdf","source":{"id":"https://openalex.org/S4406922641","display_name":"TU/e Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yigitbasi, M N, Gallet, M, Kondo, D, Iosup, A & Epema, D H J 2010, Analysis and modeling of time-correlated failures in large-scale distributed systems. in Proceedings of the 11th IEEE/ACM International Conference on Grid Computing (GRID 2010, Brussels, Belgium, October 25-28, 2011). Institute of Electrical and Electronics Engineers, pp. 65-72. https://doi.org/10.1109/GRID.2010.5697961","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W1984980195.pdf","grobid_xml":"https://content.openalex.org/works/W1984980195.grobid-xml"},"referenced_works_count":29,"referenced_works":["https://openalex.org/W1527235902","https://openalex.org/W1558516248","https://openalex.org/W1591141258","https://openalex.org/W1945970356","https://openalex.org/W1984980195","https://openalex.org/W2006075444","https://openalex.org/W2061048015","https://openalex.org/W2068171717","https://openalex.org/W2105726509","https://openalex.org/W2121098551","https://openalex.org/W2131629153","https://openalex.org/W2138509363","https://openalex.org/W2147176980","https://openalex.org/W2147388619","https://openalex.org/W2160225915","https://openalex.org/W2160821994","https://openalex.org/W2161234420","https://openalex.org/W2164136293","https://openalex.org/W2164463086","https://openalex.org/W3102490463","https://openalex.org/W4300450738","https://openalex.org/W6635224253","https://openalex.org/W6638070272","https://openalex.org/W6640538018","https://openalex.org/W6678306422","https://openalex.org/W6680456895","https://openalex.org/W6681675132","https://openalex.org/W6683682350","https://openalex.org/W6683935884"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"The":[0],"analysis":[1],"and":[2,73,110,124,158,169,181,221,243,262,276],"modeling":[3],"of":[4,42,57,82,89,107,117,129,167,173,207,216,224,239,247,280,283],"the":[5,19,39,45,55,80,87,98,105,115,126,164,174,192,205,209,213,219,222,232,245,248,257,265,281],"failures":[6,34,48,61,99,130,217,225,266],"bound":[7],"to":[8,22,278],"occur":[9],"in":[10,17,131,197],"today's":[11],"large-scale":[12,132,147,199],"production":[13,146],"systems":[14,25],"is":[15,94,137],"invaluable":[16],"providing":[18],"understanding":[20],"needed":[21],"make":[23],"these":[24,284],"fault-tolerant":[26],"yet":[27],"efficient.":[28],"Many":[29],"previous":[30],"studies":[31],"have":[32,75],"modeled":[33],"without":[35],"taking":[36],"into":[37],"account":[38],"time-varying":[40,127],"behavior":[41,128],"failures,":[43,168],"under":[44],"assumption":[46,72],"that":[47,171,189,264],"are":[49,100,269],"identically,":[50],"but":[51],"independently":[52],"distributed.":[53],"However,":[54],"presence":[56],"time":[58,165,215],"correlations":[59],"between":[60],"(such":[62],"as":[63],"peak":[64,193,210],"periods":[65,195],"with":[66],"increased":[67],"failure":[68,141,194,260],"rate)":[69],"refutes":[70],"this":[71,120],"can":[74],"a":[76,90,187,237],"significant":[77],"impact":[78],"on":[79,114,139,191,271],"effectiveness":[81],"fault-tolerance":[83,92],"mechanisms.":[84],"For":[85],"example,":[86],"performance":[88,106],"proactive":[91],"mechanism":[93],"more":[95],"effective":[96],"if":[97],"periodic":[101],"or":[102],"predictable;":[103],"similarly,":[104],"checkpointing,":[108],"redundancy,":[109],"scheduling":[111],"solutions":[112],"depends":[113],"frequency":[116],"failures.":[118],"In":[119],"study":[121,136],"we":[122,185,228,252],"analyze":[123],"model":[125,188,203,255],"distributed":[133,148,200],"systems.":[134,201,285],"Our":[135,202],"based":[138],"nineteen":[140,258],"traces":[142,176],"obtained":[143],"from":[144,236],"(mostly)":[145],"systems,":[149,153],"including":[150],"grids,":[151],"P2P":[152],"DNS":[154],"servers,":[155,157],"web":[156],"desktop":[159],"grids.":[160],"We":[161],"first":[162],"investigate":[163],"correlation":[166],"find":[170,263],"many":[172],"studied":[175],"exhibit":[177],"strong":[178],"daily":[179],"patterns":[180],"high":[182],"autocorrelation.":[183],"Then,":[184],"derive":[186],"focuses":[190],"occurring":[196],"real":[198,259],"characterizes":[204,268],"duration":[206,223],"peaks,":[208,220],"inter-arrival":[211,214],"time,":[212],"during":[218,226],"peaks;":[227],"determine":[229],"for":[230,273],"each":[231],"best-fitting":[233],"probability":[234],"distribution":[235],"set":[238],"several":[240],"candidate":[241],"distributions,":[242],"present":[244],"parameters":[246],"(best)":[249],"fit.":[250],"Last,":[251],"validate":[253],"our":[254],"against":[256],"traces,":[261],"it":[267],"responsible":[270],"average":[272],"over":[274],"50%":[275],"up":[277],"95%":[279],"downtime":[282]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":6},{"year":2018,"cited_by_count":4},{"year":2017,"cited_by_count":4},{"year":2016,"cited_by_count":7},{"year":2015,"cited_by_count":6},{"year":2014,"cited_by_count":6},{"year":2013,"cited_by_count":4},{"year":2012,"cited_by_count":7}],"updated_date":"2026-03-24T08:02:53.985720","created_date":"2025-10-10T00:00:00"}
