{"id":"https://openalex.org/W7155363704","doi":"https://doi.org/10.1145/3777911.3800637","title":"Beyond Microservices: Testing Web-Scale RCA Methods on GPU-Driven LLM Workloads","display_name":"Beyond Microservices: Testing Web-Scale RCA Methods on GPU-Driven LLM Workloads","publication_year":2026,"publication_date":"2026-04-23","ids":{"openalex":"https://openalex.org/W7155363704","doi":"https://doi.org/10.1145/3777911.3800637"},"language":null,"primary_location":{"id":"doi:10.1145/3777911.3800637","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3777911.3800637","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 17th ACM/SPEC International Conference on Performance Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3777911.3800637","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024675213","display_name":"Dominik Scheinert","orcid":"https://orcid.org/0000-0003-0763-3233"},"institutions":[{"id":"https://openalex.org/I4210149610","display_name":"Keysight Technologies (United Kingdom)","ror":"https://ror.org/04g81mm64","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210115805","https://openalex.org/I4210149610"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Dominik Scheinert","raw_affiliation_strings":["logsight.ai GmbH, Berlin, Germany"],"raw_orcid":"https://orcid.org/0000-0003-0763-3233","affiliations":[{"raw_affiliation_string":"logsight.ai GmbH, Berlin, Germany","institution_ids":["https://openalex.org/I4210149610"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036894016","display_name":"Alexander Acker","orcid":null},"institutions":[{"id":"https://openalex.org/I4210149610","display_name":"Keysight Technologies (United Kingdom)","ror":"https://ror.org/04g81mm64","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210115805","https://openalex.org/I4210149610"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Alexander Acker","raw_affiliation_strings":["logsight.ai GmbH, Berlin, Germany"],"raw_orcid":"https://orcid.org/0000-0002-0108-3034","affiliations":[{"raw_affiliation_string":"logsight.ai GmbH, Berlin, Germany","institution_ids":["https://openalex.org/I4210149610"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086364217","display_name":"Thorsten Wittkopp","orcid":"https://orcid.org/0000-0001-5154-7813"},"institutions":[{"id":"https://openalex.org/I4577782","display_name":"Technische Universit\u00e4t Berlin","ror":"https://ror.org/03v4gjf40","country_code":"DE","type":"education","lineage":["https://openalex.org/I4577782"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Thorsten Wittkopp","raw_affiliation_strings":["Technische Universit\u00e4t Berlin, Berlin, Germany"],"raw_orcid":"https://orcid.org/0000-0001-5154-7813","affiliations":[{"raw_affiliation_string":"Technische Universit\u00e4t Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I4577782"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113101499","display_name":"Soeren Becker","orcid":null},"institutions":[{"id":"https://openalex.org/I4210149610","display_name":"Keysight Technologies (United Kingdom)","ror":"https://ror.org/04g81mm64","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210115805","https://openalex.org/I4210149610"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Soeren Becker","raw_affiliation_strings":["logsight.ai GmbH, Berlin, Germany"],"raw_orcid":"https://orcid.org/0000-0001-6487-1268","affiliations":[{"raw_affiliation_string":"logsight.ai GmbH, Berlin, Germany","institution_ids":["https://openalex.org/I4210149610"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049979509","display_name":"Hamza Yous","orcid":null},"institutions":[{"id":"https://openalex.org/I4210087059","display_name":"Technology Innovation Institute","ror":"https://ror.org/001kv2y39","country_code":"AE","type":"facility","lineage":["https://openalex.org/I4210087059"]}],"countries":["AE"],"is_corresponding":false,"raw_author_name":"Hamza Yous","raw_affiliation_strings":["Technology Innovation Institute, Abu Dhabi, United Arab Emirates"],"raw_orcid":"https://orcid.org/0000-0003-2078-0668","affiliations":[{"raw_affiliation_string":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates","institution_ids":["https://openalex.org/I4210087059"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134423536","display_name":"Karnakar Reddy","orcid":"https://orcid.org/0009-0001-5699-4089"},"institutions":[{"id":"https://openalex.org/I4210087059","display_name":"Technology Innovation Institute","ror":"https://ror.org/001kv2y39","country_code":"AE","type":"facility","lineage":["https://openalex.org/I4210087059"]}],"countries":["AE"],"is_corresponding":false,"raw_author_name":"Karnakar Reddy","raw_affiliation_strings":["Technology Innovation Institute, Abu Dhabi, United Arab Emirates"],"raw_orcid":"https://orcid.org/0009-0001-5699-4089","affiliations":[{"raw_affiliation_string":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates","institution_ids":["https://openalex.org/I4210087059"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128023995","display_name":"Ibrahim Farhat","orcid":null},"institutions":[{"id":"https://openalex.org/I4210087059","display_name":"Technology Innovation Institute","ror":"https://ror.org/001kv2y39","country_code":"AE","type":"facility","lineage":["https://openalex.org/I4210087059"]}],"countries":["AE"],"is_corresponding":false,"raw_author_name":"Ibrahim Farhat","raw_affiliation_strings":["Technology Innovation Institute, Abu Dhabi, United Arab Emirates"],"raw_orcid":"https://orcid.org/0000-0002-5478-7799","affiliations":[{"raw_affiliation_string":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates","institution_ids":["https://openalex.org/I4210087059"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013316111","display_name":"Hakim Hacid","orcid":"https://orcid.org/0000-0003-2265-9343"},"institutions":[{"id":"https://openalex.org/I4210087059","display_name":"Technology Innovation Institute","ror":"https://ror.org/001kv2y39","country_code":"AE","type":"facility","lineage":["https://openalex.org/I4210087059"]}],"countries":["AE"],"is_corresponding":false,"raw_author_name":"Hakim Hacid","raw_affiliation_strings":["Technology Innovation Institute, Abu Dhabi, United Arab Emirates"],"raw_orcid":"https://orcid.org/0000-0003-2265-9343","affiliations":[{"raw_affiliation_string":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates","institution_ids":["https://openalex.org/I4210087059"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5134442818","display_name":"Odej Kao","orcid":"https://orcid.org/0000-0001-6454-6799"},"institutions":[{"id":"https://openalex.org/I4577782","display_name":"Technische Universit\u00e4t Berlin","ror":"https://ror.org/03v4gjf40","country_code":"DE","type":"education","lineage":["https://openalex.org/I4577782"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Odej Kao","raw_affiliation_strings":["Technische Universit\u00e4t Berlin, Berlin, Germany"],"raw_orcid":"https://orcid.org/0000-0001-6454-6799","affiliations":[{"raw_affiliation_string":"Technische Universit\u00e4t Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I4577782"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5024675213"],"corresponding_institution_ids":["https://openalex.org/I4210149610"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.88404301,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"163","last_page":"172"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9800999760627747,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.9800999760627747,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.0019000000320374966,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.00139999995008111,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.5613999962806702},{"id":"https://openalex.org/keywords/root-cause-analysis","display_name":"Root cause analysis","score":0.5315999984741211},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5162000060081482},{"id":"https://openalex.org/keywords/tracing","display_name":"Tracing","score":0.5109000205993652},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.45320001244544983},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.45100000500679016},{"id":"https://openalex.org/keywords/root-cause","display_name":"Root cause","score":0.44929999113082886},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.41830000281333923},{"id":"https://openalex.org/keywords/service","display_name":"Service (business)","score":0.4088999927043915},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.39010000228881836}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7501999735832214},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.5613999962806702},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.5612000226974487},{"id":"https://openalex.org/C130963320","wikidata":"https://www.wikidata.org/wiki/Q1401207","display_name":"Root cause analysis","level":2,"score":0.5315999984741211},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5162000060081482},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.5109000205993652},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.45320001244544983},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.45100000500679016},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.44929999113082886},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.41830000281333923},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.4088999927043915},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.39649999141693115},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.36090001463890076},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.35679998993873596},{"id":"https://openalex.org/C5119721","wikidata":"https://www.wikidata.org/wiki/Q220501","display_name":"Quality of service","level":2,"score":0.35429999232292175},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3522000014781952},{"id":"https://openalex.org/C35578498","wikidata":"https://www.wikidata.org/wiki/Q193424","display_name":"Web service","level":2,"score":0.3452000021934509},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.34279999136924744},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3352999985218048},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.32109999656677246},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3100000023841858},{"id":"https://openalex.org/C107094494","wikidata":"https://www.wikidata.org/wiki/Q428453","display_name":"Fault tree analysis","level":2,"score":0.3000999987125397},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C152745839","wikidata":"https://www.wikidata.org/wiki/Q5438153","display_name":"Fault detection and isolation","level":3,"score":0.29409998655319214},{"id":"https://openalex.org/C174683762","wikidata":"https://www.wikidata.org/wiki/Q609588","display_name":"Component-based software engineering","level":4,"score":0.28299999237060547},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C82214349","wikidata":"https://www.wikidata.org/wiki/Q657339","display_name":"Software metric","level":5,"score":0.2671000063419342},{"id":"https://openalex.org/C176553487","wikidata":"https://www.wikidata.org/wiki/Q7855819","display_name":"Turnaround time","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2596000134944916},{"id":"https://openalex.org/C148027188","wikidata":"https://www.wikidata.org/wiki/Q907375","display_name":"Unit testing","level":3,"score":0.2551000118255615},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C175551986","wikidata":"https://www.wikidata.org/wiki/Q47089","display_name":"Fault (geology)","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3777911.3800637","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3777911.3800637","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 17th ACM/SPEC International Conference on Performance Engineering","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3777911.3800637","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3777911.3800637","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 17th ACM/SPEC International Conference on Performance Engineering","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1607114662","https://openalex.org/W2141992894","https://openalex.org/W2181567306","https://openalex.org/W2358431938","https://openalex.org/W2913603385","https://openalex.org/W2930500175","https://openalex.org/W3092126302","https://openalex.org/W3198081460","https://openalex.org/W4200347562","https://openalex.org/W4372338130","https://openalex.org/W4384345635","https://openalex.org/W4386378095","https://openalex.org/W4399367209","https://openalex.org/W4400033172","https://openalex.org/W4400582267","https://openalex.org/W4402593721","https://openalex.org/W4403582420","https://openalex.org/W4409248832","https://openalex.org/W4410636905"],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"services":[4],"have":[5,106,126],"become":[6],"an":[7],"integral":[8],"part":[9],"of":[10,32,55,145],"search,":[11],"assistance,":[12],"and":[13,24,35,53,80,97,118,164,180,202],"decision-making":[14],"applications.":[15],"However,":[16],"unlike":[17],"traditional":[18,110],"web":[19],"or":[20],"microservices,":[21],"the":[22,75,92,100,143,172],"hardware":[23],"software":[25],"stack":[26],"enabling":[27],"LLM":[28,131,151,196],"inference":[29,152],"deployment":[30,153],"is":[31,70,77],"higher":[33],"complexity":[34],"far":[36],"less":[37],"field-tested,":[38],"making":[39],"it":[40],"more":[41],"susceptible":[42],"to":[43,48,65,195],"failures":[44],"that":[45,95,133,168,188],"are":[46],"difficult":[47],"resolve.":[49],"Keeping":[50],"outage":[51],"costs":[52],"quality":[54],"service":[56],"degradations":[57],"in":[58,68],"check":[59],"depends":[60],"on":[61,148],"shortening":[62],"mean":[63],"time":[64],"repair,":[66],"which":[67,206],"practice":[69],"gated":[71],"by":[72,90],"how":[73,99],"quickly":[74],"fault":[76],"identified,":[78],"located,":[79],"diagnosed.":[81],"Automated":[82],"root":[83],"cause":[84],"analysis":[85,200],"(RCA)":[86],"accelerates":[87],"failure":[88,101,156],"localization":[89],"identifying":[91],"system":[93],"component":[94],"failed":[96],"tracing":[98],"propagated.":[102],"Numerous":[103],"RCA":[104,124,146,190],"methods":[105,125,147,176,182],"been":[107,128],"developed":[108],"for":[109,130,205],"services,":[111],"using":[112],"request":[113],"path":[114],"tracing,":[115],"resource":[116],"metric":[117],"log":[119],"data":[120],"analysis.":[121],"Yet,":[122],"existing":[123,189],"not":[127,193],"designed":[129],"deployments":[132],"present":[134],"distinct":[135],"runtime":[136],"characteristics.":[137],"In":[138],"this":[139],"study,":[140],"we":[141,207],"evaluate":[142],"effectiveness":[144],"a":[149],"best-practice":[150],"under":[154],"controlled":[155],"injections.":[157],"Across":[158],"24":[159],"methods\u201420":[160],"metric-based,":[161],"two":[162,165],"trace-based,":[163],"multi-source\u2014we":[166],"find":[167],"multi-source":[169],"approaches":[170],"achieve":[171],"highest":[173],"accuracy,":[174],"metric-based":[175],"show":[177],"fault-type-dependent":[178],"performance,":[179],"trace-based":[181],"largely":[183],"fail.":[184],"These":[185],"results":[186],"reveal":[187],"tools":[191],"do":[192],"generalize":[194],"systems,":[197],"motivating":[198],"tailored":[199],"techniques":[201],"enhanced":[203],"observability,":[204],"formulate":[208],"guidelines.":[209]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-24T00:00:00"}
