{"id":"https://openalex.org/W2132914442","doi":"https://doi.org/10.1109/ipdps.2004.1303239","title":"System-level fault-tolerance in large-scale parallel machines with buffered coscheduling","display_name":"System-level fault-tolerance in large-scale parallel machines with buffered coscheduling","publication_year":2004,"publication_date":"2004-06-10","ids":{"openalex":"https://openalex.org/W2132914442","doi":"https://doi.org/10.1109/ipdps.2004.1303239","mag":"2132914442"},"language":"en","primary_location":{"id":"doi:10.1109/ipdps.2004.1303239","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ipdps.2004.1303239","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"18th International Parallel and Distributed Processing Symposium, 2004. Proceedings.","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://www.osti.gov/servlets/purl/977449","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066195314","display_name":"Fabrizio Petrini","orcid":"https://orcid.org/0000-0002-4977-7107"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"F. Petrini","raw_affiliation_strings":["Performance and Architecture Laboratory (PAL), Computer and Computational Sciences (CCS) Division, Los Alamos National Laboratory, NM, USA","Performance & Archit. Lab., Los Alamos Nat. Lab., NM, USA"],"affiliations":[{"raw_affiliation_string":"Performance and Architecture Laboratory (PAL), Computer and Computational Sciences (CCS) Division, Los Alamos National Laboratory, NM, USA","institution_ids":["https://openalex.org/I1343871089"]},{"raw_affiliation_string":"Performance & Archit. Lab., Los Alamos Nat. Lab., NM, USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046929640","display_name":"Kei Davis","orcid":"https://orcid.org/0000-0002-4134-1798"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"K. Davis","raw_affiliation_strings":["Performance and Architecture Laboratory (PAL), Computer and Computational Sciences (CCS) Division, Los Alamos National Laboratory, NM, USA","Performance & Archit. Lab., Los Alamos Nat. Lab., NM, USA"],"affiliations":[{"raw_affiliation_string":"Performance and Architecture Laboratory (PAL), Computer and Computational Sciences (CCS) Division, Los Alamos National Laboratory, NM, USA","institution_ids":["https://openalex.org/I1343871089"]},{"raw_affiliation_string":"Performance & Archit. Lab., Los Alamos Nat. Lab., NM, USA","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5008026936","display_name":"Jos\u00e9 Carlos Sancho","orcid":"https://orcid.org/0000-0002-6917-9155"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"J.C. Sancho","raw_affiliation_strings":["Performance and Architecture Laboratory (PAL), Computer and Computational Sciences (CCS) Division, Los Alamos National Laboratory, NM, USA","Performance & Archit. Lab., Los Alamos Nat. Lab., NM, USA"],"affiliations":[{"raw_affiliation_string":"Performance and Architecture Laboratory (PAL), Computer and Computational Sciences (CCS) Division, Los Alamos National Laboratory, NM, USA","institution_ids":["https://openalex.org/I1343871089"]},{"raw_affiliation_string":"Performance & Archit. Lab., Los Alamos Nat. Lab., NM, USA","institution_ids":["https://openalex.org/I1343871089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5066195314"],"corresponding_institution_ids":["https://openalex.org/I1343871089"],"apc_list":null,"apc_paid":null,"fwci":1.9943,"has_fulltext":true,"cited_by_count":32,"citation_normalized_percentile":{"value":0.89093242,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"209","last_page":"216"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8687095642089844},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7422157526016235},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.7301880121231079},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.6929537057876587},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5067630410194397},{"id":"https://openalex.org/keywords/transparency","display_name":"Transparency (behavior)","score":0.5050187706947327},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.46299490332603455},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.4347110986709595},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3589523732662201},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.35580113530158997},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.10855570435523987}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8687095642089844},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7422157526016235},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.7301880121231079},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.6929537057876587},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5067630410194397},{"id":"https://openalex.org/C2780233690","wikidata":"https://www.wikidata.org/wiki/Q535347","display_name":"Transparency (behavior)","level":2,"score":0.5050187706947327},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.46299490332603455},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.4347110986709595},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3589523732662201},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.35580113530158997},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.10855570435523987},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/ipdps.2004.1303239","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ipdps.2004.1303239","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"18th International Parallel and Distributed Processing Symposium, 2004. Proceedings.","raw_type":"proceedings-article"},{"id":"pmh:oai:osti.gov:977449","is_oa":true,"landing_page_url":"https://www.osti.gov/biblio/977449","pdf_url":"https://www.osti.gov/servlets/purl/977449","source":{"id":"https://openalex.org/S4306402487","display_name":"OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I139351228","host_organization_name":"Office of Scientific and Technical Information","host_organization_lineage":["https://openalex.org/I139351228"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.138.8","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.8","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://hpc.pnl.gov/people/fabrizio/papers/ftpds04.pdf","raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:osti.gov:977449","is_oa":true,"landing_page_url":"https://www.osti.gov/biblio/977449","pdf_url":"https://www.osti.gov/servlets/purl/977449","source":{"id":"https://openalex.org/S4306402487","display_name":"OSTI OAI (U.S. Department of Energy Office of Scientific and Technical Information)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I139351228","host_organization_name":"Office of Scientific and Technical Information","host_organization_lineage":["https://openalex.org/I139351228"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1702950523","display_name":null,"funder_award_id":"W-7405","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6006943239","display_name":null,"funder_award_id":"W-7405-ENG-36","funder_id":"https://openalex.org/F4320338304","funder_display_name":"Los Alamos National Laboratory"},{"id":"https://openalex.org/G797745481","display_name":null,"funder_award_id":"W-7405-ENG-36","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306078","display_name":"U.S. Department of Defense","ror":"https://ror.org/0447fe631"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320337547","display_name":"Laboratory Directed Research and Development","ror":"https://ror.org/01e41cf67"},{"id":"https://openalex.org/F4320338304","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W2132914442.pdf"},"referenced_works_count":19,"referenced_works":["https://openalex.org/W1537929875","https://openalex.org/W1555915743","https://openalex.org/W1971952282","https://openalex.org/W2079712861","https://openalex.org/W2091327926","https://openalex.org/W2094526959","https://openalex.org/W2100970777","https://openalex.org/W2104994660","https://openalex.org/W2112121929","https://openalex.org/W2138509363","https://openalex.org/W2158272436","https://openalex.org/W2533204039","https://openalex.org/W2548680145","https://openalex.org/W2725179571","https://openalex.org/W4232225775","https://openalex.org/W6631987640","https://openalex.org/W6643050054","https://openalex.org/W6680456895","https://openalex.org/W6695275532"],"related_works":["https://openalex.org/W3081288631","https://openalex.org/W3152382318","https://openalex.org/W3004686567","https://openalex.org/W2738656338","https://openalex.org/W2064720525","https://openalex.org/W2111125783","https://openalex.org/W1862835629","https://openalex.org/W2136799148","https://openalex.org/W2897533804","https://openalex.org/W2890506991"],"abstract_inverted_index":{"Summary":[0],"form":[1],"only":[2],"given.":[3],"As":[4],"the":[5,29,39,45,52,77,87,90,128,145],"number":[6],"of":[7,15,25,27,31,66,80,89,92,138,147],"processors":[8],"for":[9,44],"multiteraflop":[10],"systems":[11,20],"grows":[12],"to":[13,22,60,69,74,100,123,140,163],"tens":[14],"thousands,":[16],"with":[17,76,173],"proposed":[18],"petaflops":[19],"likely":[21],"contain":[23],"hundreds":[24],"thousands":[26],"processors,":[28],"assumption":[30],"fully":[32],"reliable":[33],"hardware":[34,81,148],"has":[35],"been":[36],"abandoned.":[37],"Although":[38],"mean":[40],"time":[41],"between":[42],"failures":[43,91],"individual":[46],"components":[47],"can":[48],"be":[49],"very":[50,150],"high,":[51],"large":[53],"total":[54],"component":[55],"count":[56],"will":[57,84,109],"inevitably":[58],"lead":[59],"frequent":[61],"failures.":[62],"It":[63],"is":[64,171],"therefore":[65],"paramount":[67],"importance":[68],"develop":[70],"new":[71,112],"software":[72],"solutions":[73],"deal":[75],"unavoidable":[78],"reality":[79],"faults.":[82],"We":[83],"first":[85],"describe":[86],"nature":[88],"current":[93,174],"large-scale":[94],"machines,":[95],"and":[96,157],"extrapolate":[97],"these":[98],"results":[99,167],"future":[101],"machines.":[102],"Based":[103],"on":[104],"this":[105,170],"preliminary":[106],"analysis":[107],"we":[108,115],"present":[110],"a":[111],"technology":[113],"that":[114,169],"are":[116],"currently":[117],"developing,":[118],"buffered":[119],"coscheduling,":[120],"which":[121],"seeks":[122],"implement":[124],"fault":[125],"tolerance":[126],"at":[127],"operating":[129],"system":[130],"level.":[131],"Major":[132],"design":[133],"goals":[134],"include":[135],"dynamic":[136],"reallocation":[137],"resources":[139],"allow":[141],"continuing":[142],"execution":[143],"in":[144],"presence":[146],"failures,":[149],"high":[151,153],"scalability,":[152],"efficiency":[154],"(low":[155],"overhead),":[156],"transparency":[158],"-":[159],"requiring":[160],"no":[161],"changes":[162],"user":[164],"applications.":[165],"Preliminary":[166],"show":[168],"attainable":[172],"hardware.":[175]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":4},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":2},{"year":2014,"cited_by_count":3},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
