{"id":"https://openalex.org/W1956304975","doi":"https://doi.org/10.1109/cluster.2015.106","title":"Building a Fault Tolerant Application Using the GASPI Communication Layer","display_name":"Building a Fault Tolerant Application Using the GASPI Communication Layer","publication_year":2015,"publication_date":"2015-09-01","ids":{"openalex":"https://openalex.org/W1956304975","doi":"https://doi.org/10.1109/cluster.2015.106","mag":"1956304975"},"language":"en","primary_location":{"id":"doi:10.1109/cluster.2015.106","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster.2015.106","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE International Conference on Cluster Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102889958","display_name":"Faisal Shahzad","orcid":"https://orcid.org/0000-0002-6766-7622"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Faisal Shahzad","raw_affiliation_strings":["Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038048025","display_name":"Moritz Kreutzer","orcid":"https://orcid.org/0000-0002-7822-9468"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Moritz Kreutzer","raw_affiliation_strings":["Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112049374","display_name":"Thomas Zeiser","orcid":"https://orcid.org/0009-0002-2916-911X"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Thomas Zeiser","raw_affiliation_strings":["Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088123154","display_name":"Rui Machado","orcid":"https://orcid.org/0009-0009-2759-2302"},"institutions":[{"id":"https://openalex.org/I3019415892","display_name":"Fraunhofer Institute for Industrial Mathematics","ror":"https://ror.org/019hjw009","country_code":"DE","type":"facility","lineage":["https://openalex.org/I3019415892","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Rui Machado","raw_affiliation_strings":["ITWM, Fraunhofer Inst. for Ind. Math., Kaiserslautern, Germany"],"affiliations":[{"raw_affiliation_string":"ITWM, Fraunhofer Inst. for Ind. Math., Kaiserslautern, Germany","institution_ids":["https://openalex.org/I3019415892"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066751418","display_name":"Andreas Pieper","orcid":"https://orcid.org/0000-0002-9054-7274"},"institutions":[{"id":"https://openalex.org/I36522303","display_name":"Universit\u00e4t Greifswald","ror":"https://ror.org/00r1edq15","country_code":"DE","type":"education","lineage":["https://openalex.org/I36522303"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Andreas Pieper","raw_affiliation_strings":["Inst. of Phys., Univ. of Greifswald, Greifswald, Germany"],"affiliations":[{"raw_affiliation_string":"Inst. of Phys., Univ. of Greifswald, Greifswald, Germany","institution_ids":["https://openalex.org/I36522303"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082552227","display_name":"Georg Hager","orcid":"https://orcid.org/0000-0002-8723-2781"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Georg Hager","raw_affiliation_strings":["Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070209050","display_name":"Gerhard Wellein","orcid":"https://orcid.org/0000-0001-7371-3026"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Gerhard Wellein","raw_affiliation_strings":["Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Comput. Center, Univ. of Erlangen-Nuremberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5102889958"],"corresponding_institution_ids":["https://openalex.org/I181369854"],"apc_list":null,"apc_paid":null,"fwci":2.3297,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.89477318,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"580","last_page":"587"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9876999855041504,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7881799936294556},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.7764387130737305},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.7418313026428223},{"id":"https://openalex.org/keywords/partitioned-global-address-space","display_name":"Partitioned global address space","score":0.7217659950256348},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6740354299545288},{"id":"https://openalex.org/keywords/software-fault-tolerance","display_name":"Software fault tolerance","score":0.6264011859893799},{"id":"https://openalex.org/keywords/mean-time-between-failures","display_name":"Mean time between failures","score":0.5993596315383911},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4922598898410797},{"id":"https://openalex.org/keywords/fault-coverage","display_name":"Fault coverage","score":0.4849407970905304},{"id":"https://openalex.org/keywords/fault-detection-and-isolation","display_name":"Fault detection and isolation","score":0.45064055919647217},{"id":"https://openalex.org/keywords/fault","display_name":"Fault (geology)","score":0.4348936676979065},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.4241725206375122},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4207516014575958},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.41411978006362915},{"id":"https://openalex.org/keywords/failure-rate","display_name":"Failure rate","score":0.1520257294178009},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.11367148160934448},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.09989207983016968},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08882862329483032}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7881799936294556},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.7764387130737305},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.7418313026428223},{"id":"https://openalex.org/C60832428","wikidata":"https://www.wikidata.org/wiki/Q945818","display_name":"Partitioned global address space","level":3,"score":0.7217659950256348},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6740354299545288},{"id":"https://openalex.org/C50712370","wikidata":"https://www.wikidata.org/wiki/Q4269346","display_name":"Software fault tolerance","level":3,"score":0.6264011859893799},{"id":"https://openalex.org/C44154001","wikidata":"https://www.wikidata.org/wiki/Q754940","display_name":"Mean time between failures","level":3,"score":0.5993596315383911},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4922598898410797},{"id":"https://openalex.org/C126953365","wikidata":"https://www.wikidata.org/wiki/Q5438152","display_name":"Fault coverage","level":3,"score":0.4849407970905304},{"id":"https://openalex.org/C152745839","wikidata":"https://www.wikidata.org/wiki/Q5438153","display_name":"Fault detection and isolation","level":3,"score":0.45064055919647217},{"id":"https://openalex.org/C175551986","wikidata":"https://www.wikidata.org/wiki/Q47089","display_name":"Fault (geology)","level":2,"score":0.4348936676979065},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.4241725206375122},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4207516014575958},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.41411978006362915},{"id":"https://openalex.org/C163164238","wikidata":"https://www.wikidata.org/wiki/Q2737027","display_name":"Failure rate","level":2,"score":0.1520257294178009},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.11367148160934448},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.09989207983016968},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08882862329483032},{"id":"https://openalex.org/C172707124","wikidata":"https://www.wikidata.org/wiki/Q423488","display_name":"Actuator","level":2,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0},{"id":"https://openalex.org/C165205528","wikidata":"https://www.wikidata.org/wiki/Q83371","display_name":"Seismology","level":1,"score":0.0},{"id":"https://openalex.org/C134146338","wikidata":"https://www.wikidata.org/wiki/Q1815901","display_name":"Electronic circuit","level":2,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/cluster.2015.106","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cluster.2015.106","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE International Conference on Cluster Computing","raw_type":"proceedings-article"},{"id":"pmh:oai:fraunhofer.de:N-374947","is_oa":false,"landing_page_url":"http://publica.fraunhofer.de/documents/N-374947.html","pdf_url":null,"source":{"id":"https://openalex.org/S4306400801","display_name":"Publikationsdatenbank der Fraunhofer-Gesellschaft (Fraunhofer-Gesellschaft)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4923324","host_organization_name":"Fraunhofer-Gesellschaft","host_organization_lineage":["https://openalex.org/I4923324"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Fraunhofer ITWM","raw_type":"Conference Paper"},{"id":"pmh:oai:publica.fraunhofer.de:publica/390698","is_oa":false,"landing_page_url":"https://publica.fraunhofer.de/handle/publica/390698","pdf_url":null,"source":{"id":"https://openalex.org/S4306400318","display_name":"Fraunhofer-Publica (Fraunhofer-Gesellschaft)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4923324","host_organization_name":"Fraunhofer-Gesellschaft","host_organization_lineage":["https://openalex.org/I4923324"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W141129880","https://openalex.org/W1981432246","https://openalex.org/W1984564341","https://openalex.org/W1986905947","https://openalex.org/W1998221613","https://openalex.org/W2000870360","https://openalex.org/W2023324599","https://openalex.org/W2044681268","https://openalex.org/W2067474224","https://openalex.org/W2084293824","https://openalex.org/W2089536264","https://openalex.org/W2105039796","https://openalex.org/W2128577831","https://openalex.org/W2133943294","https://openalex.org/W2168405877","https://openalex.org/W2730779250","https://openalex.org/W4239022315","https://openalex.org/W4245507143","https://openalex.org/W6646345524","https://openalex.org/W6684440641","https://openalex.org/W6740868767","https://openalex.org/W7046307239"],"related_works":["https://openalex.org/W2971479921","https://openalex.org/W3145923041","https://openalex.org/W2946906624","https://openalex.org/W841176518","https://openalex.org/W2101077206","https://openalex.org/W2157727563","https://openalex.org/W2470343202","https://openalex.org/W1978919910","https://openalex.org/W1488443159","https://openalex.org/W2005196107"],"abstract_inverted_index":{"It":[0,110],"is":[1,32,40,66,86,99,176,221],"commonly":[2],"agreed":[3],"that":[4,35],"highly":[5],"parallel":[6],"software":[7],"on":[8,42,106,138,160],"Exascale":[9],"computers":[10],"will":[11],"suffer":[12],"from":[13],"many":[14],"more":[15],"runtime":[16],"failures":[17,28],"due":[18],"to":[19,27,60,115,145,189],"the":[20,24,44,74,82,107,112,117,158,161,168,173,191,194,215],"decreasing":[21],"trend":[22],"in":[23,43,71,88,128,178,207,211],"mean":[25],"time":[26],"(MTTF).":[29],"Therefore,":[30],"it":[31],"not":[33,67],"surprising":[34],"a":[36,55,100,148,182],"lot":[37],"of":[38,46,119,123,165,170,193,213,222],"research":[39],"going":[41],"area":[45],"fault":[47,50,126,151,201],"tolerance":[48,127],"and":[49,142,172,218,226],"mitigation.":[51],"Applications":[52],"should":[53],"survive":[54],"failure":[56,216],"and/or":[57],"be":[58],"able":[59],"recover":[61,157],"with":[62],"minimal":[63],"cost.":[64],"MPI":[65],"yet":[68],"very":[69],"mature":[70],"handling":[72],"failures,":[73],"User-Level":[75],"Failure":[76],"Mitigation":[77],"(ULFM)":[78],"proposal":[79],"being":[80],"currently":[81],"most":[83],"promising":[84],"approach":[85],"still":[87],"its":[89,129],"prototype":[90],"phase.":[91],"In":[92],"our":[93],"work":[94],"we":[95,132,135],"use":[96,181],"GASPI,":[97],"which":[98],"relatively":[101],"new":[102],"communication":[103],"library":[104],"based":[105,187],"PGAS":[108],"model.":[109],"provides":[111],"missing":[113],"features":[114],"allow":[116,146],"design":[118],"fault-tolerant":[120],"applications.":[121],"Instead":[122],"introducing":[124],"algorithm-based":[125],"true":[130],"sense,":[131],"demonstrate":[133],"how":[134],"can":[136],"build":[137],"(existing)":[139],"clever":[140],"checkpointing":[141],"extend":[143],"applications":[144],"integrate":[147],"low":[149],"cost":[150,220],"detection":[152,202,217],"mechanism":[153,175,203],"and,":[154],"if":[155],"necessary,":[156],"application":[159,188],"fly.":[162],"The":[163],"aspects":[164],"process":[166],"management,":[167],"restoration":[169],"groups":[171],"recovery":[174,219],"presented":[177],"detail.":[179],"We":[180],"sparse":[183],"matrix":[184],"vector":[185],"multiplication":[186],"perform":[190],"analysis":[192],"overhead":[195,206],"introduced":[196],"by":[197],"such":[198],"modifications.":[199],"Our":[200],"causes":[204],"no":[205],"failure-free":[208],"cases,":[209],"whereas":[210],"case":[212],"failure(s),":[214],"reasonably":[223],"acceptable":[224],"order":[225],"shows":[227],"good":[228],"scalability.":[229]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
