{"id":"https://openalex.org/W2128854702","doi":"https://doi.org/10.1145/1065944.1065973","title":"Fault tolerant high performance computing by a coding approach","display_name":"Fault tolerant high performance computing by a coding approach","publication_year":2005,"publication_date":"2005-06-15","ids":{"openalex":"https://openalex.org/W2128854702","doi":"https://doi.org/10.1145/1065944.1065973","mag":"2128854702"},"language":"en","primary_location":{"id":"doi:10.1145/1065944.1065973","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1065944.1065973","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the tenth ACM SIGPLAN symposium on Principles and practice of parallel programming","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061737717","display_name":"Zizhong Chen","orcid":"https://orcid.org/0000-0003-2578-4940"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zizhong Chen","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005256587","display_name":"Graham E. Fagg","orcid":null},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Graham E. Fagg","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103111823","display_name":"Edgar Gabriel","orcid":"https://orcid.org/0000-0002-2541-1370"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Edgar Gabriel","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020584809","display_name":"Julien Langou","orcid":"https://orcid.org/0000-0002-7803-1822"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Julien Langou","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035177658","display_name":"Thara Angskun","orcid":"https://orcid.org/0000-0003-3677-8327"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Thara Angskun","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010055736","display_name":"George Bosilca","orcid":"https://orcid.org/0000-0003-2411-8495"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"George Bosilca","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075517045","display_name":"Jack Dongarra","orcid":"https://orcid.org/0000-0003-3247-1782"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jack Dongarra","raw_affiliation_strings":["University of Tennessee, Knoxville, TN","University of Tennessee, Knoxville, TN;"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"University of Tennessee, Knoxville, TN;","institution_ids":["https://openalex.org/I75027704"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5061737717"],"corresponding_institution_ids":["https://openalex.org/I75027704"],"apc_list":null,"apc_paid":null,"fwci":4.9271,"has_fulltext":false,"cited_by_count":95,"citation_normalized_percentile":{"value":0.95624729,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"213","last_page":"223"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.827103853225708},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.6293780207633972},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.5756215453147888},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5567229390144348},{"id":"https://openalex.org/keywords/floating-point","display_name":"Floating point","score":0.5242348313331604},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.5178871154785156},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.4596019387245178},{"id":"https://openalex.org/keywords/erasure-code","display_name":"Erasure code","score":0.4535733461380005},{"id":"https://openalex.org/keywords/computer-engineering","display_name":"Computer engineering","score":0.4357423186302185},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3716616630554199},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.31579142808914185},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.21085166931152344}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.827103853225708},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.6293780207633972},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.5756215453147888},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5567229390144348},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.5242348313331604},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.5178871154785156},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.4596019387245178},{"id":"https://openalex.org/C137529215","wikidata":"https://www.wikidata.org/wiki/Q5385031","display_name":"Erasure code","level":3,"score":0.4535733461380005},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4357423186302185},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3716616630554199},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.31579142808914185},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.21085166931152344},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/1065944.1065973","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1065944.1065973","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the tenth ACM SIGPLAN symposium on Principles and practice of parallel programming","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.atira.dk:openaire_cris_publications/da937a6b-f042-45ab-b71f-da7c48a7e52c","is_oa":false,"landing_page_url":"https://research.manchester.ac.uk/en/publications/da937a6b-f042-45ab-b71f-da7c48a7e52c","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Chen, Z, Fagg, G E, Gabriel, E, Langou, J, Angskun, T, Bosilca, G & Dongarra, J 2005, Fault tolerant high performance computing by a coding approach. in Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP|Proc ACM SIGPLAN Symp Prins Pract Parall Program PPOPP. Association for Computing Machinery, pp. 213-223, 2005 ACM SIGPLAN Symposium on Principles and Practise of Parallel Programming, PROPP 05, Chicago, IL, 1/07/05. < http://dblp.uni-trier.de/db/conf/ppopp/ppopp2005.html#ChenFGLABD05 >","raw_type":"info:eu-repo/semantics/conferenceObject"},{"id":"pmh:oai:pure.atira.dk:publications/da937a6b-f042-45ab-b71f-da7c48a7e52c","is_oa":false,"landing_page_url":"https://www.research.manchester.ac.uk/portal/en/publications/fault-tolerant-high-performance-computing-by-a-coding-approach(da937a6b-f042-45ab-b71f-da7c48a7e52c).html","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Chen, Z, Fagg, G E, Gabriel, E, Langou, J, Angskun, T, Bosilca, G & Dongarra, J 2005, Fault tolerant high performance computing by a coding approach. in Proceedings of the ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, PPOPP|Proc ACM SIGPLAN Symp Prins Pract Parall Program PPOPP. Association for Computing Machinery, pp. 213-223, 2005 ACM SIGPLAN Symposium on Principles and Practise of Parallel Programming, PROPP 05, Chicago, IL, 1/07/05. < http://dblp.uni-trier.de/db/conf/ppopp/ppopp2005.html#ChenFGLABD05 >","raw_type":"info:eu-repo/semantics/conferenceObject"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W81486879","https://openalex.org/W169659540","https://openalex.org/W1575350781","https://openalex.org/W1575701986","https://openalex.org/W1576341769","https://openalex.org/W1607166357","https://openalex.org/W1971952282","https://openalex.org/W1974401809","https://openalex.org/W1974755392","https://openalex.org/W1986009243","https://openalex.org/W2017838136","https://openalex.org/W2033656974","https://openalex.org/W2068092015","https://openalex.org/W2081612620","https://openalex.org/W2091257550","https://openalex.org/W2102469268","https://openalex.org/W2125488852","https://openalex.org/W2130076109","https://openalex.org/W2133287150","https://openalex.org/W2146942047","https://openalex.org/W2156074676","https://openalex.org/W2165863720","https://openalex.org/W2294787855","https://openalex.org/W2296772319","https://openalex.org/W2885267195","https://openalex.org/W6673424395"],"related_works":["https://openalex.org/W2384867379","https://openalex.org/W4400094300","https://openalex.org/W2329539859","https://openalex.org/W2227905990","https://openalex.org/W2765823764","https://openalex.org/W3214280620","https://openalex.org/W3191490922","https://openalex.org/W2619878113","https://openalex.org/W4280610722","https://openalex.org/W3185228140"],"abstract_inverted_index":{"As":[0],"the":[1,13,23,73,82,85,110,138,141,157,170,183,196,199,209],"number":[2,221],"of":[3,15,26,84,112,143,182,202,222],"processors":[4],"in":[5,123,147],"today's":[6,34,50],"high":[7,29,51,94,148],"performance":[8,30,52,95,149,197,228],"computers":[9,17],"continues":[10],"to":[11,40,67,90,103,136,217],"grow,":[12],"mean-time-to-failure":[14],"these":[16],"are":[18,36],"becoming":[19],"significantly":[20],"shorter":[21],"than":[22],"execution":[24],"time":[25],"many":[27],"current":[28],"computing":[31,53,96,150],"applications.":[32,151],"Although":[33],"architectures":[35],"usually":[37],"robust":[38],"enough":[39],"survive":[41,57,218],"node":[42,58,64,104,224],"failures":[43,59,105,225],"without":[44,106],"suffering":[45],"complete":[46],"system":[47],"failure,":[48],"most":[49],"applications":[54,97],"can":[55,101],"not":[56],"and,":[60],"therefore,":[61],"whenever":[62],"a":[63,76,178,188,219],"fails,":[65],"have":[66],"abort":[68],"themselves":[69],"and":[70,140,168,193,198,230],"restart":[71],"from":[72,133],"beginning":[74],"or":[75],"stable-storage-based":[77],"checkpoint.":[78],"This":[79,130],"paper":[80],"explores":[81],"use":[83,111],"floating-point":[86,161,179,211],"arithmetic":[87,162,180,212],"coding":[88,163,185,213],"approach":[89,164,214],"build":[91],"fault":[92],"survivable":[93],"so":[98],"that":[99,208],"they":[100],"adapt":[102],"aborting":[107],"themselves.":[108],"Despite":[109],"erasure":[113],"codes":[114,146],"over":[115],"Galois":[116],"field":[117],"has":[118],"been":[119],"theoretically":[120],"attempted":[121],"before":[122],"diskless":[124,166],"checkpointing,":[125],"few":[126],"actual":[127],"implementations":[128],"exist.":[129],"probably":[131],"derives":[132],"concerns":[134],"related":[135],"both":[137,195],"efficiency":[139],"complexity":[142],"implementing":[144],"such":[145],"In":[152],"this":[153,203],"paper,":[154],"we":[155],"introduce":[156],"simple":[158],"but":[159],"efficient":[160],"into":[165,187],"checkpointing":[167],"address":[169],"associated":[171],"round-off":[172],"error":[173],"issue.":[174],"We":[175],"also":[176],"implement":[177],"version":[181],"Reed-Solomon":[184],"scheme":[186],"conjugate":[189],"gradient":[190],"equation":[191],"solver":[192],"evaluate":[194],"numerical":[200,232],"impact":[201],"scheme.":[204],"Experimental":[205],"results":[206],"demonstrate":[207],"proposed":[210],"is":[215],"able":[216],"small":[220],"simultaneous":[223],"with":[226],"low":[227],"overhead":[229],"little":[231],"impact.":[233],"Copyright":[234],"2005":[235],"ACM.":[236]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":4},{"year":2017,"cited_by_count":4},{"year":2016,"cited_by_count":6},{"year":2015,"cited_by_count":7},{"year":2014,"cited_by_count":4},{"year":2013,"cited_by_count":9},{"year":2012,"cited_by_count":6}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
