{"id":"https://openalex.org/W2035137413","doi":"https://doi.org/10.1145/2304576.2304589","title":"Data-driven fault tolerance for work stealing computations","display_name":"Data-driven fault tolerance for work stealing computations","publication_year":2012,"publication_date":"2012-06-25","ids":{"openalex":"https://openalex.org/W2035137413","doi":"https://doi.org/10.1145/2304576.2304589","mag":"2035137413"},"language":"en","primary_location":{"id":"doi:10.1145/2304576.2304589","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2304576.2304589","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 26th ACM international conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100695928","display_name":"Wenjing Ma","orcid":"https://orcid.org/0000-0002-1795-4498"},"institutions":[{"id":"https://openalex.org/I142606810","display_name":"Pacific Northwest National Laboratory","ror":"https://ror.org/05h992307","country_code":"US","type":"facility","lineage":["https://openalex.org/I1325736334","https://openalex.org/I1330989302","https://openalex.org/I142606810","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Wenjing Ma","raw_affiliation_strings":["Pacific Northwest National Laboratory, Richland, WA, USA","Pacific Northwest National Laboratory, Richland, WA USA"],"affiliations":[{"raw_affiliation_string":"Pacific Northwest National Laboratory, Richland, WA, USA","institution_ids":["https://openalex.org/I142606810"]},{"raw_affiliation_string":"Pacific Northwest National Laboratory, Richland, WA USA","institution_ids":["https://openalex.org/I142606810"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5006650430","display_name":"Sriram Krishnamoorthy","orcid":"https://orcid.org/0000-0002-4682-1002"},"institutions":[{"id":"https://openalex.org/I142606810","display_name":"Pacific Northwest National Laboratory","ror":"https://ror.org/05h992307","country_code":"US","type":"facility","lineage":["https://openalex.org/I1325736334","https://openalex.org/I1330989302","https://openalex.org/I142606810","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sriram Krishnamoorthy","raw_affiliation_strings":["Pacific Northwest National Laboratory, Richland, WA, USA","Pacific Northwest National Laboratory, Richland, WA USA"],"affiliations":[{"raw_affiliation_string":"Pacific Northwest National Laboratory, Richland, WA, USA","institution_ids":["https://openalex.org/I142606810"]},{"raw_affiliation_string":"Pacific Northwest National Laboratory, Richland, WA USA","institution_ids":["https://openalex.org/I142606810"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5100695928"],"corresponding_institution_ids":["https://openalex.org/I142606810"],"apc_list":null,"apc_paid":null,"fwci":4.6097,"has_fulltext":false,"cited_by_count":27,"citation_normalized_percentile":{"value":0.95001166,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"79","last_page":"90"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8556982278823853},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.7066376209259033},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.650041937828064},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5868334174156189},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.49015745520591736},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.48823145031929016},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.42926979064941406},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4118577539920807},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.35443419218063354},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3403470814228058},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.23311716318130493},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1176123321056366},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.0994168221950531}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8556982278823853},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.7066376209259033},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.650041937828064},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5868334174156189},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.49015745520591736},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.48823145031929016},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.42926979064941406},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4118577539920807},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.35443419218063354},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3403470814228058},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.23311716318130493},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1176123321056366},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.0994168221950531},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2304576.2304589","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2304576.2304589","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 26th ACM international conference on Supercomputing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1493735863","https://openalex.org/W1527076549","https://openalex.org/W1576205104","https://openalex.org/W1576341769","https://openalex.org/W2042544282","https://openalex.org/W2048784665","https://openalex.org/W2063924830","https://openalex.org/W2072725684","https://openalex.org/W2083613288","https://openalex.org/W2087946700","https://openalex.org/W2090409324","https://openalex.org/W2096611465","https://openalex.org/W2100970777","https://openalex.org/W2105039796","https://openalex.org/W2107696263","https://openalex.org/W2108183412","https://openalex.org/W2108801243","https://openalex.org/W2114035455","https://openalex.org/W2116115793","https://openalex.org/W2119018856","https://openalex.org/W2146381930","https://openalex.org/W2147504831","https://openalex.org/W2148109481","https://openalex.org/W2153704625","https://openalex.org/W2155066383","https://openalex.org/W2161551433","https://openalex.org/W2161989797","https://openalex.org/W2173213060","https://openalex.org/W4234858060","https://openalex.org/W4242946001"],"related_works":["https://openalex.org/W2046435967","https://openalex.org/W4231775656","https://openalex.org/W2383646825","https://openalex.org/W2371018915","https://openalex.org/W2354191502","https://openalex.org/W1972225038","https://openalex.org/W2380573388","https://openalex.org/W3134658850","https://openalex.org/W2369636957","https://openalex.org/W2355938171"],"abstract_inverted_index":{"Work":[0],"stealing":[1,129],"is":[2,42,66,76],"a":[3,33,45,52],"promising":[4],"technique":[5],"to":[6,68,79,84,130,167],"dynamically":[7,131],"tolerate":[8],"variations":[9],"in":[10,51,87],"the":[11,63,82,88,133,136,141,151,157,163,172],"execution":[12],"environment,":[13],"including":[14],"faults,":[15],"system":[16],"noise,":[17],"and":[18,118,139,154,171],"energy":[19],"constraints.":[20],"In":[21],"this":[22],"paper,":[23],"we":[24],"present":[25,100],"fault":[26,158],"tolerance":[27,159],"mechanisms":[28],"for":[29],"task":[30],"parallel":[31],"computations,":[32],"popular":[34],"computation":[35,41],"idiom,":[36],"employing":[37],"work":[38,92,128,177],"stealing.":[39,93],"The":[40,56],"organized":[43],"as":[44],"collection":[46],"of":[47,58,90,156],"tasks":[48,83,134],"with":[49,106,114,175],"data":[50,59,72],"global":[53],"address":[54],"space.":[55],"completion":[57],"operations,":[60],"rather":[61],"than":[62],"actual":[64],"messages,":[65],"tracked":[67],"derive":[69],"an":[70],"idempotent":[71],"store.":[73],"This":[74],"information":[75],"also":[77],"used":[78],"accurately":[80],"identify":[81],"be":[85],"re-executed":[86],"presence":[89],"random":[91],"We":[94,125,148],"consider":[95],"three":[96,142],"recovery":[97,105,113,120],"schemes":[98,143],"that":[99,150],"distinct":[101],"trade-offs":[102],"---":[103],"lazy":[104],"potentially":[107],"increased":[108],"re-execution":[109],"cost,":[110],"immediate":[111],"collective":[112],"associated":[115],"synchronization":[116],"overheads,":[117],"noncollective":[119],"enabled":[121],"by":[122],"additional":[123],"communication.":[124],"employ":[126],"distributed-memory":[127],"rebalance":[132],"onto":[135],"live":[137],"processes":[138],"evaluate":[140],"using":[144],"candidate":[145],"application":[146],"benchmarks.":[147],"demonstrate":[149],"overheads":[152,173],"(space":[153],"time)":[155],"mechanism":[160],"are":[161,169],"low,":[162],"costs":[164],"incurred":[165],"due":[166],"failures":[168],"small,":[170],"decrease":[174],"per-process":[176],"at":[178],"scale.":[179]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":3},{"year":2017,"cited_by_count":3},{"year":2016,"cited_by_count":3},{"year":2015,"cited_by_count":3},{"year":2014,"cited_by_count":7},{"year":2013,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
