{"id":"https://openalex.org/W2127253816","doi":"https://doi.org/10.1109/rivf.2009.5174661","title":"Trouble Dashboard: A Distributed Failure Monitoring System for High-End Computing","display_name":"Trouble Dashboard: A Distributed Failure Monitoring System for High-End Computing","publication_year":2009,"publication_date":"2009-01-01","ids":{"openalex":"https://openalex.org/W2127253816","doi":"https://doi.org/10.1109/rivf.2009.5174661","mag":"2127253816"},"language":"en","primary_location":{"id":"doi:10.1109/rivf.2009.5174661","is_oa":false,"landing_page_url":"https://doi.org/10.1109/rivf.2009.5174661","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2009 IEEE-RIVF International Conference on Computing and Communication Technologies","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101511221","display_name":"Thanh Do","orcid":"https://orcid.org/0000-0001-9893-5725"},"institutions":[{"id":"https://openalex.org/I64519617","display_name":"Hanoi University","ror":"https://ror.org/01mxx0e62","country_code":"VN","type":"education","lineage":["https://openalex.org/I64519617"]}],"countries":["VN"],"is_corresponding":true,"raw_author_name":"Thanh Do","raw_affiliation_strings":["Department of Information Systems, Hanoi University of Technology, Hanoi, Vietnam"],"affiliations":[{"raw_affiliation_string":"Department of Information Systems, Hanoi University of Technology, Hanoi, Vietnam","institution_ids":["https://openalex.org/I64519617"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101965727","display_name":"Thuy Nguyen","orcid":"https://orcid.org/0000-0002-6283-7602"},"institutions":[{"id":"https://openalex.org/I64519617","display_name":"Hanoi University","ror":"https://ror.org/01mxx0e62","country_code":"VN","type":"education","lineage":["https://openalex.org/I64519617"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Thuy Nguyen","raw_affiliation_strings":["Department of Information Systems, Hanoi University of Technology, Hanoi, Vietnam"],"affiliations":[{"raw_affiliation_string":"Department of Information Systems, Hanoi University of Technology, Hanoi, Vietnam","institution_ids":["https://openalex.org/I64519617"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100690905","display_name":"Dung Tien Nguyen","orcid":"https://orcid.org/0000-0001-5781-1793"},"institutions":[{"id":"https://openalex.org/I64519617","display_name":"Hanoi University","ror":"https://ror.org/01mxx0e62","country_code":"VN","type":"education","lineage":["https://openalex.org/I64519617"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Dung T. Nguyen","raw_affiliation_strings":["Igh Performance Computing Center, Hanoi University of Technology, Hanoi, Vietnam"],"affiliations":[{"raw_affiliation_string":"Igh Performance Computing Center, Hanoi University of Technology, Hanoi, Vietnam","institution_ids":["https://openalex.org/I64519617"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024341956","display_name":"Hiep Chi Nguyen","orcid":null},"institutions":[{"id":"https://openalex.org/I64519617","display_name":"Hanoi University","ror":"https://ror.org/01mxx0e62","country_code":"VN","type":"education","lineage":["https://openalex.org/I64519617"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Hiep C. Nguyen","raw_affiliation_strings":["Igh Performance Computing Center, Hanoi University of Technology, Hanoi, Vietnam"],"affiliations":[{"raw_affiliation_string":"Igh Performance Computing Center, Hanoi University of Technology, Hanoi, Vietnam","institution_ids":["https://openalex.org/I64519617"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100651611","display_name":"Weisong Shi","orcid":"https://orcid.org/0000-0001-5864-4675"},"institutions":[{"id":"https://openalex.org/I185443292","display_name":"Wayne State University","ror":"https://ror.org/01070mq45","country_code":"US","type":"education","lineage":["https://openalex.org/I185443292"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Weisong Shi","raw_affiliation_strings":["Department of Computer Science, Wayne State University, Detroit, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Wayne State University, Detroit, USA","institution_ids":["https://openalex.org/I185443292"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101511221"],"corresponding_institution_ids":["https://openalex.org/I64519617"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17933246,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7861330509185791},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7770271301269531},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.7435809969902039},{"id":"https://openalex.org/keywords/dashboard","display_name":"Dashboard","score":0.6765094995498657},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.6670466661453247},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5343450903892517},{"id":"https://openalex.org/keywords/single-point-of-failure","display_name":"Single point of failure","score":0.4945225417613983},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.47955915331840515},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4147171378135681},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2599329650402069},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.23775064945220947}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7861330509185791},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7770271301269531},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.7435809969902039},{"id":"https://openalex.org/C33499554","wikidata":"https://www.wikidata.org/wiki/Q1417134","display_name":"Dashboard","level":2,"score":0.6765094995498657},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.6670466661453247},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5343450903892517},{"id":"https://openalex.org/C165136773","wikidata":"https://www.wikidata.org/wiki/Q1363179","display_name":"Single point of failure","level":2,"score":0.4945225417613983},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.47955915331840515},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4147171378135681},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2599329650402069},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.23775064945220947},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/rivf.2009.5174661","is_oa":false,"landing_page_url":"https://doi.org/10.1109/rivf.2009.5174661","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2009 IEEE-RIVF International Conference on Computing and Communication Technologies","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6299999952316284,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320309272","display_name":"Wayne State University","ror":"https://ror.org/01070mq45"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1664954303","https://openalex.org/W1974388055","https://openalex.org/W2001465248","https://openalex.org/W2044316154","https://openalex.org/W2103023238","https://openalex.org/W2104847466","https://openalex.org/W2108139438","https://openalex.org/W2111038504","https://openalex.org/W2145289499","https://openalex.org/W2148508494","https://openalex.org/W2148622484","https://openalex.org/W2148952547","https://openalex.org/W2154983209","https://openalex.org/W2155842136","https://openalex.org/W2157593310","https://openalex.org/W2158907675","https://openalex.org/W2159085330","https://openalex.org/W2171418548","https://openalex.org/W4285719527","https://openalex.org/W6681732262"],"related_works":["https://openalex.org/W2607236244","https://openalex.org/W2382083831","https://openalex.org/W4239369595","https://openalex.org/W2169792944","https://openalex.org/W3048913176","https://openalex.org/W4386891374","https://openalex.org/W4292603292","https://openalex.org/W2048767857","https://openalex.org/W2113594494","https://openalex.org/W2388889456"],"abstract_inverted_index":{"Failure":[0],"management":[1],"is":[2,49,70],"crucial":[3],"for":[4,57,72],"high":[5],"performance":[6],"computing":[7,100],"systems,":[8],"especially":[9],"when":[10,84,108],"the":[11,29,77,91,95],"complexity":[12],"of":[13,34,68,79,97],"applications":[14,81],"and":[15,32,41,61,102,118,121],"underlying":[16],"infrastructure":[17],"has":[18],"grown":[19],"sharply":[20],"in":[21],"recent":[22],"years.":[23],"In":[24,64],"this":[25],"paper,":[26],"we":[27],"present":[28],"design,":[30],"implementation":[31],"experiment":[33],"trouble":[35],"dashboard":[36],"(TD),":[37],"an":[38],"adaptive,":[39],"flexible,":[40],"low":[42,116],"overhead":[43],"failure":[44],"monitoring":[45],"system.":[46],"Our":[47],"goal":[48],"to":[50,75,93,124,126],"provide":[51],"a":[52,66],"lightweight,":[53],"scalable":[54],"failure-monitoring":[55],"tool":[56,92],"both":[58],"application":[59,73],"scientists":[60,74],"system":[62],"managers.":[63],"TD,":[65],"set":[67],"APIs":[69],"provided":[71],"control":[76],"behavior":[78],"their":[80],"with":[82],"flexibility":[83],"failures":[85,107],"happen.":[86],"System":[87],"managers":[88],"can":[89],"use":[90],"monitor":[94],"status":[96],"not":[98],"only":[99],"nodes":[101],"running":[103],"tasks":[104],"but":[105],"also":[106],"they":[109],"occur.":[110],"Experiments":[111],"show":[112],"that":[113],"TD":[114],"incurs":[115],"overhead,":[117],"remains":[119],"accurate":[120],"flexible":[122],"enough":[123],"adapt":[125],"various":[127],"applications.":[128]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
