{"id":"https://openalex.org/W7147594451","doi":"https://doi.org/10.48550/arxiv.2603.28781","title":"When GPUs Fail Quietly: Observability-Aware Early Warning Beyond Numeric Telemetry","display_name":"When GPUs Fail Quietly: Observability-Aware Early Warning Beyond Numeric Telemetry","publication_year":2026,"publication_date":"2026-03-17","ids":{"openalex":"https://openalex.org/W7147594451","doi":"https://doi.org/10.48550/arxiv.2603.28781"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.28781","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28781","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.28781","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132557225","display_name":"Michael Bidollahkhani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bidollahkhani, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132716507","display_name":"Freja Nordsiek","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nordsiek, Freja","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132690324","display_name":"Julian M. Kunkel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kunkel, Julian M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2012999951839447,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2012999951839447,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.19439999759197235,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10472","display_name":"Semiconductor materials and devices","score":0.18160000443458557,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/telemetry","display_name":"Telemetry","score":0.7394999861717224},{"id":"https://openalex.org/keywords/payload","display_name":"Payload (computing)","score":0.7017999887466431},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5400000214576721},{"id":"https://openalex.org/keywords/interconnection","display_name":"Interconnection","score":0.5307000279426575},{"id":"https://openalex.org/keywords/observable","display_name":"Observable","score":0.5250999927520752},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.4203000068664551},{"id":"https://openalex.org/keywords/degradation","display_name":"Degradation (telecommunications)","score":0.41589999198913574},{"id":"https://openalex.org/keywords/warning-system","display_name":"Warning system","score":0.3855000138282776}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7598999738693237},{"id":"https://openalex.org/C183121708","wikidata":"https://www.wikidata.org/wiki/Q209867","display_name":"Telemetry","level":2,"score":0.7394999861717224},{"id":"https://openalex.org/C134066672","wikidata":"https://www.wikidata.org/wiki/Q1424639","display_name":"Payload (computing)","level":3,"score":0.7017999887466431},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.5600000023841858},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5400000214576721},{"id":"https://openalex.org/C123745756","wikidata":"https://www.wikidata.org/wiki/Q1665949","display_name":"Interconnection","level":2,"score":0.5307000279426575},{"id":"https://openalex.org/C32848918","wikidata":"https://www.wikidata.org/wiki/Q845789","display_name":"Observable","level":2,"score":0.5250999927520752},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.4203000068664551},{"id":"https://openalex.org/C2779679103","wikidata":"https://www.wikidata.org/wiki/Q5251805","display_name":"Degradation (telecommunications)","level":2,"score":0.41589999198913574},{"id":"https://openalex.org/C29825287","wikidata":"https://www.wikidata.org/wiki/Q1427940","display_name":"Warning system","level":2,"score":0.3855000138282776},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.34139999747276306},{"id":"https://openalex.org/C204530211","wikidata":"https://www.wikidata.org/wiki/Q752823","display_name":"Thermal","level":2,"score":0.32589998841285706},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.31610000133514404},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.31130000948905945},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.3052999973297119},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.2913999855518341},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2623000144958496},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2572999894618988},{"id":"https://openalex.org/C2779296788","wikidata":"https://www.wikidata.org/wiki/Q5326904","display_name":"Early warning system","level":3,"score":0.2563999891281128},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.25619998574256897},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2538999915122986},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.28781","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28781","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.28781","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28781","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"GPU":[0,89,116],"nodes":[1,117],"are":[2,140],"central":[3],"to":[4,155],"modern":[5],"HPC":[6],"and":[7,55,67,91,105,124,139],"AI":[8],"workloads,":[9],"yet":[10],"many":[11],"failures":[12,134],"do":[13],"not":[14],"manifest":[15],"as":[16,25,97],"immediate":[17],"hard":[18],"faults.":[19],"While":[20],"some":[21],"instabilities":[22],"emerge":[23],"gradually":[24],"weak":[26],"thermal":[27,85],"or":[28,38,52],"efficiency":[29],"drift,":[30],"a":[31],"significant":[32],"class":[33],"occurs":[34],"abruptly":[35],"with":[36],"little":[37],"no":[39],"numeric":[40,137],"precursor.":[41],"In":[42],"these":[43],"detachment-class":[44],"failures,":[45],"GPUs":[46],"become":[47],"unavailable":[48],"at":[49,118,167],"the":[50,56],"driver":[51],"interconnect":[53],"level":[54],"dominant":[57],"observable":[58,142],"signal":[59],"is":[60,110,164],"structural,":[61],"including":[62],"disappearance":[63],"of":[64,69],"device":[65],"metrics":[66],"degradation":[68,94],"monitoring":[70],"payload":[71],"integrity.":[72],"This":[73],"paper":[74],"proposes":[75],"an":[76],"observability-aware":[77],"early-warning":[78,151],"framework":[79,109],"that":[80,132],"jointly":[81],"models":[82],"(i)":[83],"utilization-aware":[84],"drift":[86],"signatures":[87],"in":[88,161],"telemetry":[90,114,145],"(ii)":[92],"monitoring-pipeline":[93],"indicators":[95],"such":[96],"scrape":[98],"latency":[99],"increase,":[100],"sample":[101],"loss,":[102],"time-series":[103],"gaps,":[104],"device-metric":[106],"disappearance.":[107],"The":[108,158],"evaluated":[111],"on":[112],"production":[113],"from":[115],"GWDG,":[119],"where":[120],"GPU,":[121],"node,":[122],"monitoring,":[123],"scheduler":[125],"signals":[126],"can":[127],"be":[128],"correlated.":[129],"Results":[130],"show":[131],"detachment":[133],"exhibit":[135],"minimal":[136],"precursor":[138],"primarily":[141],"through":[143],"structural":[144],"collapse,":[146],"while":[147],"joint":[148],"modeling":[149],"increases":[150],"lead":[152],"time":[153],"compared":[154],"GPU-only":[156],"detection.":[157],"dataset":[159],"used":[160],"this":[162],"study":[163],"publicly":[165],"available":[166],"https://doi.org/10.5281/zenodo.19052367.":[168]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-02T00:00:00"}
