{"id":"https://openalex.org/W1602603475","doi":"https://doi.org/10.1109/hpcsim.2015.7237113","title":"Techniques to improve the scalability of collective checkpointing at large scale","display_name":"Techniques to improve the scalability of collective checkpointing at large scale","publication_year":2015,"publication_date":"2015-07-01","ids":{"openalex":"https://openalex.org/W1602603475","doi":"https://doi.org/10.1109/hpcsim.2015.7237113","mag":"1602603475"},"language":"en","primary_location":{"id":"doi:10.1109/hpcsim.2015.7237113","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpcsim.2015.7237113","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 International Conference on High Performance Computing &amp; Simulation (HPCS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085745891","display_name":"Bogdan Nicolae","orcid":"https://orcid.org/0000-0002-0661-7509"},"institutions":[{"id":"https://openalex.org/I4210145784","display_name":"IBM Research - Ireland","ror":"https://ror.org/04jnxr720","country_code":"IE","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115","https://openalex.org/I4210145784"]},{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]}],"countries":["IE","US"],"is_corresponding":true,"raw_author_name":"Bogdan Nicolae","raw_affiliation_strings":["IBM Research, Ireland","IBM Research - Ireland#TAB#"],"affiliations":[{"raw_affiliation_string":"IBM Research, Ireland","institution_ids":["https://openalex.org/I4210145784"]},{"raw_affiliation_string":"IBM Research - Ireland#TAB#","institution_ids":["https://openalex.org/I1341412227"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5085745891"],"corresponding_institution_ids":["https://openalex.org/I1341412227","https://openalex.org/I4210145784"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0430574,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"71","issue":null,"first_page":"660","last_page":"661"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/petascale-computing","display_name":"Petascale computing","score":0.9642105102539062},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8065834045410156},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7820795774459839},{"id":"https://openalex.org/keywords/exascale-computing","display_name":"Exascale computing","score":0.6857291460037231},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.6761391162872314},{"id":"https://openalex.org/keywords/parallel-i/o","display_name":"Parallel I/O","score":0.5878728628158569},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5339314937591553},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.5267098546028137},{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.47324854135513306},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.4404182732105255},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.41965562105178833},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.41642531752586365},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3808707296848297},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3325650095939636},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.1704321801662445}],"concepts":[{"id":"https://openalex.org/C185410017","wikidata":"https://www.wikidata.org/wiki/Q7171778","display_name":"Petascale computing","level":3,"score":0.9642105102539062},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8065834045410156},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7820795774459839},{"id":"https://openalex.org/C2778837361","wikidata":"https://www.wikidata.org/wiki/Q2450880","display_name":"Exascale computing","level":3,"score":0.6857291460037231},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.6761391162872314},{"id":"https://openalex.org/C2781057727","wikidata":"https://www.wikidata.org/wiki/Q9941","display_name":"Parallel I/O","level":2,"score":0.5878728628158569},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5339314937591553},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.5267098546028137},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.47324854135513306},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.4404182732105255},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.41965562105178833},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.41642531752586365},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3808707296848297},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3325650095939636},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.1704321801662445},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpcsim.2015.7237113","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpcsim.2015.7237113","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 International Conference on High Performance Computing &amp; Simulation (HPCS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6299999952316284,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W1572669544","https://openalex.org/W1968075755","https://openalex.org/W1978807861","https://openalex.org/W1984564341","https://openalex.org/W2078349455","https://openalex.org/W2100970777","https://openalex.org/W2105524676","https://openalex.org/W2131645490","https://openalex.org/W2153958596","https://openalex.org/W4241332380","https://openalex.org/W6679471073"],"related_works":["https://openalex.org/W2021702679","https://openalex.org/W3038449658","https://openalex.org/W1582746211","https://openalex.org/W2266027327","https://openalex.org/W2249929881","https://openalex.org/W4289494037","https://openalex.org/W1569809235","https://openalex.org/W2278366184","https://openalex.org/W1602603475","https://openalex.org/W2532379532"],"abstract_inverted_index":{"Scientific":[0],"and":[1,17,24,56,63,270,299,432],"data-intensive":[2],"computing":[3],"have":[4],"matured":[5],"over":[6],"the":[7,37,40,152,155,200,215,224,234,257,266,273,287,302,318,325,329,351,363,379,384,401,407],"last":[8],"couple":[9],"of":[10,15,39,54,104,123,154,202,217,256,276],"years":[11],"in":[12,22,44,52,81,120,214,312,332,340,414],"all":[13,98],"fields":[14],"science":[16],"industry.":[18],"Their":[19],"rapid":[20],"increase":[21],"complexity":[23],"scale":[25],"has":[26,321],"prompted":[27],"ongoing":[28],"efforts":[29],"dedicated":[30],"to":[31,93,108,133,158,198,222,232,268,297,315,335,354,374,378,409,416],"reach":[32],"exascale":[33],"infrastructure":[34],"capability":[35],"by":[36,116],"end":[38],"decade.":[41],"However,":[42,281],"advances":[43],"this":[45,82,251,282,360,393,419],"context":[46,83],"are":[47,58,64,84,149,295,331,430],"not":[48,193,284,361,371],"homogeneous:":[49],"I/O":[50,86,196,225,247,274,292,356],"capabilities":[51],"terms":[53],"networking":[55],"storage":[57,172,186,239,279,289],"lagging":[59],"behind":[60],"computational":[61,136],"power":[62],"often":[65,114],"considered":[66],"a":[67,121,161,170,175,184,253,277,309,336,343,394,422],"major":[68],"limitation":[69],"that":[70,71,368],"persists":[72],"even":[73],"at":[74,138,349,411],"petascale":[75],"[1].":[76],"A":[77],"particularly":[78],"difficult":[79],"challenge":[80],"collective":[85,95,386],"access":[87],"patterns":[88],"(which":[89],"we":[90],"henceforth":[91],"refer":[92],"as":[94,180,189,300,400,433],"checkpointing)":[96],"where":[97],"processes":[99],"simultaneously":[100,107],"dump":[101,212,320],"large":[102,254],"amounts":[103],"related":[105],"data":[106,203,258,303],"persistent":[109],"storage.":[110],"This":[111],"pattern":[112],"is":[113,231,283,306,314,370,389],"exhibited":[115],"large-scale,":[117],"bulk-synchronous":[118],"applications":[119],"variety":[122],"circumstances,":[124,169],"e.g.,":[125],"when":[126,144],"they":[127,304],"use":[128,246],"checkpoint-restart":[129],"fault":[130],"tolerance":[131],"techniques":[132],"save":[134],"intermediate":[135],"states":[137],"regular":[139],"time":[140,373],"intervals":[141,413],"[2]":[142],"or":[143,183,245,291],"intermediate,":[145],"globally":[146],"synchronized":[147],"results":[148],"needed":[150],"during":[151,164],"lifetime":[153],"computation":[156],"(e.g.":[157,174],"understand":[159],"how":[160],"simulation":[162],"progresses":[163],"key":[165],"phases).":[166],"Under":[167],"such":[168,179,188,301,434],"decoupled":[171,278],"system":[173,178,187,339,382],"parallel":[176,337,380],"file":[177,338,381],"GPFS":[181],"[3]":[182],"specialized":[185],"BlobSeer":[190],"[4])":[191],"does":[192],"provide":[194],"sufficient":[195],"bandwidth":[197,226,275,357],"handle":[199],"explosion":[201],"sizes:":[204],"for":[205,272,418],"example,":[206],"Jones":[207],"et":[208],"al.":[209],"[5]":[210],"predict":[211],"times":[213],"order":[216,221,415],"several":[218],"hours.":[219],"In":[220,391],"overcome":[223],"limitation,":[227],"one":[228],"potential":[229],"solution":[230,345],"equip":[233],"compute":[235],"nodes":[236,294],"with":[237,397],"local":[238,288,319,428],"(i.e.,":[240],"HDDs,":[241],"SSDs,":[242],"NVMs,":[243],"etc.)":[244],"forwarding":[248,293],"nodes.":[249],"Using":[250],"approach,":[252],"part":[255],"can":[259,346],"be":[260,347],"dumped":[261,334],"locally,":[262],"which":[263,405],"completely":[264],"avoids":[265],"need":[267,408],"consume":[269],"compete":[271],"system.":[280],"without":[285],"drawbacks:":[286],"devices":[290],"prone":[296],"failures":[298],"hold":[305],"volatile.":[307],"Thus,":[308],"popular":[310],"approach":[311],"practice":[313],"wait":[316],"until":[317],"finished,":[322],"then":[323],"let":[324],"application":[326],"continue":[327],"while":[328],"checkpoints":[330],"turn":[333],"background.":[341],"Such":[342],"straightforward":[344],"effective":[348],"hiding":[350],"overhead":[352,436],"incurred":[353],"due":[355],"limitations,":[358],"but":[359],"necessarily":[362],"case:":[364],"it":[365],"may":[366],"happen":[367],"there":[369],"enough":[372],"fully":[375],"flush":[376],"everything":[377],"before":[383],"next":[385],"checkpoint":[387,410,424],"request":[388],"issued.":[390],"fact,":[392],"likely":[395],"scenario":[396],"growing":[398],"scale,":[399],"failure":[402],"rate":[403],"increases,":[404],"introduces":[406],"smaller":[412,423],"compensate":[417],"effect.":[420],"Furthermore,":[421],"interval":[425],"also":[426],"means":[427],"dumps":[429],"frequent":[431],"their":[435],"becomes":[437],"significant":[438],"itself.":[439]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
