{"id":"https://openalex.org/W7133338558","doi":"https://doi.org/10.48550/arxiv.2603.02057","title":"Beyond Microservices: Testing Web-Scale RCA Methods on GPU-Driven LLM Workloads","display_name":"Beyond Microservices: Testing Web-Scale RCA Methods on GPU-Driven LLM Workloads","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133338558","doi":"https://doi.org/10.48550/arxiv.2603.02057"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.02057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.02057","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024675213","display_name":"Dominik Scheinert","orcid":"https://orcid.org/0000-0003-0763-3233"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Scheinert, Dominik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036894016","display_name":"Alexander Acker","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Acker, Alexander","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127885821","display_name":"Thorsten Wittkopp","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wittkopp, Thorsten","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Becker, Soeren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Becker, Soeren","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049979509","display_name":"Hamza Yous","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yous, Hamza","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134423536","display_name":"Karnakar Reddy","orcid":"https://orcid.org/0009-0001-5699-4089"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Reddy, Karnakar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128023995","display_name":"Ibrahim Farhat","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Farhat, Ibrahim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127984234","display_name":"Hakim Hacid","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hacid, Hakim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5119909913","display_name":"Odej Kao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kao, Odej","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5024675213"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.979200005531311,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.979200005531311,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.0019000000320374966,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.5575000047683716},{"id":"https://openalex.org/keywords/root-cause-analysis","display_name":"Root cause analysis","score":0.5328999757766724},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.515500009059906},{"id":"https://openalex.org/keywords/tracing","display_name":"Tracing","score":0.5098000168800354},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4553999900817871},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.44850000739097595},{"id":"https://openalex.org/keywords/root-cause","display_name":"Root cause","score":0.4481000006198883},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.41929998993873596},{"id":"https://openalex.org/keywords/service","display_name":"Service (business)","score":0.40869998931884766}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7531999945640564},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.5590999722480774},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.5575000047683716},{"id":"https://openalex.org/C130963320","wikidata":"https://www.wikidata.org/wiki/Q1401207","display_name":"Root cause analysis","level":2,"score":0.5328999757766724},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.515500009059906},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.5098000168800354},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4553999900817871},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.44850000739097595},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.4481000006198883},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.41929998993873596},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.40869998931884766},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.39739999175071716},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3878999948501587},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.357699990272522},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.35690000653266907},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3564000129699707},{"id":"https://openalex.org/C5119721","wikidata":"https://www.wikidata.org/wiki/Q220501","display_name":"Quality of service","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C35578498","wikidata":"https://www.wikidata.org/wiki/Q193424","display_name":"Web service","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.34060001373291016},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3328000009059906},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.31709998846054077},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3109000027179718},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3091000020503998},{"id":"https://openalex.org/C107094494","wikidata":"https://www.wikidata.org/wiki/Q428453","display_name":"Fault tree analysis","level":2,"score":0.29809999465942383},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C152745839","wikidata":"https://www.wikidata.org/wiki/Q5438153","display_name":"Fault detection and isolation","level":3,"score":0.2939000129699707},{"id":"https://openalex.org/C174683762","wikidata":"https://www.wikidata.org/wiki/Q609588","display_name":"Component-based software engineering","level":4,"score":0.2824000120162964},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C176553487","wikidata":"https://www.wikidata.org/wiki/Q7855819","display_name":"Turnaround time","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C82214349","wikidata":"https://www.wikidata.org/wiki/Q657339","display_name":"Software metric","level":5,"score":0.2655999958515167},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.25600001215934753},{"id":"https://openalex.org/C148027188","wikidata":"https://www.wikidata.org/wiki/Q907375","display_name":"Unit testing","level":3,"score":0.2547999918460846},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.02057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.02057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"services":[4],"have":[5,106,126],"become":[6],"an":[7],"integral":[8],"part":[9],"of":[10,32,55,145],"search,":[11],"assistance,":[12],"and":[13,24,35,53,80,97,118,165,182,204],"decision-making":[14],"applications.":[15],"However,":[16],"unlike":[17],"traditional":[18,110],"web":[19],"or":[20],"microservices,":[21],"the":[22,75,92,100,143,174],"hardware":[23],"software":[25],"stack":[26],"enabling":[27],"LLM":[28,131,151,198],"inference":[29,152],"deployment":[30,153],"is":[31,70,77],"higher":[33],"complexity":[34],"far":[36],"less":[37],"field-tested,":[38],"making":[39],"it":[40],"more":[41],"susceptible":[42],"to":[43,48,65,197],"failures":[44],"that":[45,95,133,170,190],"are":[46],"difficult":[47],"resolve.":[49],"Keeping":[50],"outage":[51],"costs":[52],"quality":[54],"service":[56],"degradations":[57],"in":[58,68],"check":[59],"depends":[60],"on":[61,148],"shortening":[62],"mean":[63],"time":[64],"repair,":[66],"which":[67,208],"practice":[69],"gated":[71],"by":[72,90],"how":[73,99],"quickly":[74],"fault":[76],"identified,":[78],"located,":[79],"diagnosed.":[81],"Automated":[82],"root":[83],"cause":[84],"analysis":[85,202],"(RCA)":[86],"accelerates":[87],"failure":[88,101,156],"localization":[89],"identifying":[91],"system":[93],"component":[94],"failed":[96],"tracing":[98],"propagated.":[102],"Numerous":[103],"RCA":[104,124,146,192],"methods":[105,125,147,160,178,184],"been":[107,128],"developed":[108],"for":[109,130,207],"services,":[111],"using":[112],"request":[113],"path":[114],"tracing,":[115],"resource":[116],"metric":[117],"log":[119],"data":[120],"analysis.":[121],"Yet,":[122],"existing":[123,191],"not":[127,195],"designed":[129],"deployments":[132],"present":[134],"distinct":[135],"runtime":[136],"characteristics.":[137],"In":[138],"this":[139],"study,":[140],"we":[141,168,209],"evaluate":[142],"effectiveness":[144],"a":[149],"best-practice":[150],"under":[154],"controlled":[155],"injections.":[157],"Across":[158],"24":[159],"(20":[161],"metric-based,":[162],"two":[163,166],"trace-based,":[164],"multi-source),":[167],"find":[169],"multi-source":[171],"approaches":[172],"achieve":[173],"highest":[175],"accuracy,":[176],"metric-based":[177],"show":[179],"fault-type-dependent":[180],"performance,":[181],"trace-based":[183],"largely":[185],"fail.":[186],"These":[187],"results":[188],"reveal":[189],"tools":[193],"do":[194],"generalize":[196],"systems,":[199],"motivating":[200],"tailored":[201],"techniques":[203],"enhanced":[205],"observability,":[206],"formulate":[210],"guidelines.":[211]},"counts_by_year":[],"updated_date":"2026-05-03T08:25:01.440150","created_date":"2026-03-04T00:00:00"}
