{"id":"https://openalex.org/W2115408363","doi":"https://doi.org/10.1145/2370816.2370848","title":"Probabilistic diagnosis of performance faults in large-scale parallel applications","display_name":"Probabilistic diagnosis of performance faults in large-scale parallel applications","publication_year":2012,"publication_date":"2012-09-19","ids":{"openalex":"https://openalex.org/W2115408363","doi":"https://doi.org/10.1145/2370816.2370848","mag":"2115408363"},"language":"en","primary_location":{"id":"doi:10.1145/2370816.2370848","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2370816.2370848","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st international conference on Parallel architectures and compilation techniques","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033868370","display_name":"Ignacio Laguna","orcid":"https://orcid.org/0000-0002-9374-4433"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ignacio Laguna","raw_affiliation_strings":["Purdue University, West Lafayette, IN, USA","Purdue University, School of Electrical and Computer Engineering, West Lafayette, IN, 47907, USA"],"affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]},{"raw_affiliation_string":"Purdue University, School of Electrical and Computer Engineering, West Lafayette, IN, 47907, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006598908","display_name":"Dong H. Ahn","orcid":"https://orcid.org/0000-0001-6722-0532"},"institutions":[{"id":"https://openalex.org/I1282311441","display_name":"Lawrence Livermore National Laboratory","ror":"https://ror.org/041nk4h53","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282311441","https://openalex.org/I1330989302","https://openalex.org/I198811213","https://openalex.org/I4210138311"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dong H. Ahn","raw_affiliation_strings":["Lawrence Livermore National Laboratory, Livermore, CA, USA","Lawrence Livermore National Laboratory, Computation Directorate, CA 94550, USA"],"affiliations":[{"raw_affiliation_string":"Lawrence Livermore National Laboratory, Livermore, CA, USA","institution_ids":["https://openalex.org/I1282311441"]},{"raw_affiliation_string":"Lawrence Livermore National Laboratory, Computation Directorate, CA 94550, USA","institution_ids":["https://openalex.org/I1282311441"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058719424","display_name":"Bronis R. de Supinski","orcid":"https://orcid.org/0000-0002-0339-1006"},"institutions":[{"id":"https://openalex.org/I1282311441","display_name":"Lawrence Livermore National Laboratory","ror":"https://ror.org/041nk4h53","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282311441","https://openalex.org/I1330989302","https://openalex.org/I198811213","https://openalex.org/I4210138311"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bronis R. de Supinski","raw_affiliation_strings":["Lawrence Livermore National Laboratory, Livermore, CA, USA","Lawrence Livermore National Laboratory, Computation Directorate, CA 94550, USA"],"affiliations":[{"raw_affiliation_string":"Lawrence Livermore National Laboratory, Livermore, CA, USA","institution_ids":["https://openalex.org/I1282311441"]},{"raw_affiliation_string":"Lawrence Livermore National Laboratory, Computation Directorate, CA 94550, USA","institution_ids":["https://openalex.org/I1282311441"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047310442","display_name":"Saurabh Bagchi","orcid":"https://orcid.org/0000-0002-4239-5632"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Saurabh Bagchi","raw_affiliation_strings":["Purdue University, West Lafayette, IN, USA","Purdue University, School of Electrical and Computer Engineering, West Lafayette, IN, 47907, USA"],"affiliations":[{"raw_affiliation_string":"Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]},{"raw_affiliation_string":"Purdue University, School of Electrical and Computer Engineering, West Lafayette, IN, 47907, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5007709486","display_name":"Todd Gamblin","orcid":"https://orcid.org/0000-0002-7857-2805"},"institutions":[{"id":"https://openalex.org/I1282311441","display_name":"Lawrence Livermore National Laboratory","ror":"https://ror.org/041nk4h53","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282311441","https://openalex.org/I1330989302","https://openalex.org/I198811213","https://openalex.org/I4210138311"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Todd Gamblin","raw_affiliation_strings":["Lawrence Livermore National Laboratory, Livermore, CA, USA","Lawrence Livermore National Laboratory, Computation Directorate, CA 94550, USA"],"affiliations":[{"raw_affiliation_string":"Lawrence Livermore National Laboratory, Livermore, CA, USA","institution_ids":["https://openalex.org/I1282311441"]},{"raw_affiliation_string":"Lawrence Livermore National Laboratory, Computation Directorate, CA 94550, USA","institution_ids":["https://openalex.org/I1282311441"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5033868370"],"corresponding_institution_ids":["https://openalex.org/I219193219"],"apc_list":null,"apc_paid":null,"fwci":3.546,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.93364162,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"213","last_page":"222"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.860748827457428},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7058572769165039},{"id":"https://openalex.org/keywords/debugging","display_name":"Debugging","score":0.6885440945625305},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.6713615655899048},{"id":"https://openalex.org/keywords/program-slicing","display_name":"Program slicing","score":0.6472328901290894},{"id":"https://openalex.org/keywords/root-cause","display_name":"Root cause","score":0.576528012752533},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5740671753883362},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5691130757331848},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5546492338180542},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.498335599899292},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.49306365847587585},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.4537714421749115},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4244583547115326},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3581259250640869},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.24853989481925964},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1547897756099701},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.14819759130477905},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.11272335052490234}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.860748827457428},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7058572769165039},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.6885440945625305},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6713615655899048},{"id":"https://openalex.org/C91071405","wikidata":"https://www.wikidata.org/wiki/Q1413145","display_name":"Program slicing","level":3,"score":0.6472328901290894},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.576528012752533},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5740671753883362},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5691130757331848},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5546492338180542},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.498335599899292},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.49306365847587585},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.4537714421749115},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4244583547115326},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3581259250640869},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.24853989481925964},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1547897756099701},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.14819759130477905},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.11272335052490234},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2370816.2370848","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2370816.2370848","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 21st international conference on Parallel architectures and compilation techniques","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320338286","display_name":"Lawrence Livermore National Laboratory","ror":"https://ror.org/041nk4h53"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1553908659","https://openalex.org/W1967324265","https://openalex.org/W2024166759","https://openalex.org/W2036048726","https://openalex.org/W2055767681","https://openalex.org/W2056400780","https://openalex.org/W2073949144","https://openalex.org/W2075092012","https://openalex.org/W2079055089","https://openalex.org/W2105079569","https://openalex.org/W2112121929","https://openalex.org/W2120525022","https://openalex.org/W2120542024","https://openalex.org/W2122282651","https://openalex.org/W2123728588","https://openalex.org/W2124515646","https://openalex.org/W2125359382","https://openalex.org/W2131613942","https://openalex.org/W2132148774","https://openalex.org/W2140476198","https://openalex.org/W2144344516","https://openalex.org/W2149897682","https://openalex.org/W2157593310","https://openalex.org/W2160097679","https://openalex.org/W2161694060","https://openalex.org/W2161957670","https://openalex.org/W2293624369","https://openalex.org/W2913256667","https://openalex.org/W6631615826"],"related_works":["https://openalex.org/W2080412071","https://openalex.org/W2011854888","https://openalex.org/W1996791333","https://openalex.org/W2026117800","https://openalex.org/W4234735799","https://openalex.org/W2764797736","https://openalex.org/W772318632","https://openalex.org/W1528649993","https://openalex.org/W3022626626","https://openalex.org/W1880733898"],"abstract_inverted_index":{"Debugging":[0],"large-scale":[1],"parallel":[2],"applications":[3],"is":[4],"challenging.":[5],"Most":[6,22],"existing":[7],"techniques":[8],"provide":[9],"mechanisms":[10],"for":[11],"process":[12],"control":[13],"but":[14],"little":[15],"information":[16],"about":[17],"the":[18,57,98,125,128,137,147],"causes":[19],"of":[20,67,101,127,152,157],"failures.":[21],"debuggers":[23],"also":[24],"scale":[25,108],"poorly":[26],"despite":[27],"continued":[28],"growth":[29],"in":[30,61,109,141,149],"supercomputer":[31],"core":[32],"counts.":[33],"Our":[34,53,130],"novel,":[35],"highly":[36],"scalable":[37],"tool":[38,54,95],"helps":[39],"developers":[40],"to":[41,44,78],"understand":[42],"and":[43,48,70,123,144],"fix":[45],"performance":[46],"failures":[47],"correctness":[49],"problems":[50],"at":[51,107],"scale.":[52],"probabilistically":[55],"infers":[56],"least":[58,138],"progressed":[59,139],"task":[60,140],"MPI":[62],"programs":[63],"using":[64],"Markov":[65],"models":[66],"execution":[68],"history":[69],"dependence":[71],"analysis.":[72],"This":[73],"analysis":[74],"guides":[75],"program":[76],"slicing":[77],"find":[79],"code":[80],"that":[81,93,133],"may":[82],"have":[83],"caused":[84],"a":[85,88,102,110,150,153],"failure.":[86],"In":[87],"blind":[89],"study,":[90],"we":[91,115],"demonstrate":[92],"our":[94],"can":[96,145],"isolate":[97],"root":[99],"cause":[100],"particularly":[103],"perplexing":[104],"bug":[105],"encountered":[106],"molecular":[111],"dynamics":[112],"simulation.":[113],"Further,":[114],"perform":[116,146],"fault":[117],"injections":[118],"into":[119],"two":[120],"benchmark":[121],"codes":[122],"measure":[124],"scalability":[126],"tool.":[129],"results":[131],"show":[132],"it":[134],"accurately":[135],"detects":[136],"most":[142],"cases":[143],"diagnosis":[148],"fraction":[151],"second":[154],"with":[155],"thousands":[156],"tasks.":[158]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":3},{"year":2016,"cited_by_count":2},{"year":2015,"cited_by_count":4},{"year":2014,"cited_by_count":4},{"year":2013,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
