{"id":"https://openalex.org/W7162572825","doi":"https://doi.org/10.48550/arxiv.2605.26461","title":"Characterization-Guided GPU Fault Resilience in NVIDIA MPS","display_name":"Characterization-Guided GPU Fault Resilience in NVIDIA MPS","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162572825","doi":"https://doi.org/10.48550/arxiv.2605.26461"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.26461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.26461","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137118878","display_name":"Rixin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Rixin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137191316","display_name":"Xingqi Cui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cui, Xingqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011837748","display_name":"K. \u2013S. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kaijian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137163788","display_name":"Xinheng Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Xinheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101916543","display_name":"Zirui Liu","orcid":"https://orcid.org/0000-0001-9062-6565"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zirui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137090858","display_name":"Yuke Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yuke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5053453103","display_name":"Jiarong Xing","orcid":"https://orcid.org/0009-0006-6163-0569"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Jiarong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.6500999927520752,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.6500999927520752,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.16429999470710754,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.07620000094175339,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5658000111579895},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5412999987602234},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.5101000070571899},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.4848000109195709},{"id":"https://openalex.org/keywords/resilience","display_name":"Resilience (materials science)","score":0.4381999969482422},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4198000133037567},{"id":"https://openalex.org/keywords/fault","display_name":"Fault (geology)","score":0.41920000314712524},{"id":"https://openalex.org/keywords/software-fault-tolerance","display_name":"Software fault tolerance","score":0.39969998598098755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7874000072479248},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5658000111579895},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5412999987602234},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4848000109195709},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.43849998712539673},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.4381999969482422},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4198000133037567},{"id":"https://openalex.org/C175551986","wikidata":"https://www.wikidata.org/wiki/Q47089","display_name":"Fault (geology)","level":2,"score":0.41920000314712524},{"id":"https://openalex.org/C50712370","wikidata":"https://www.wikidata.org/wiki/Q4269346","display_name":"Software fault tolerance","level":3,"score":0.39969998598098755},{"id":"https://openalex.org/C167391956","wikidata":"https://www.wikidata.org/wiki/Q1401211","display_name":"Fault model","level":3,"score":0.39010000228881836},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.37869998812675476},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C2775928411","wikidata":"https://www.wikidata.org/wiki/Q2041312","display_name":"Fault injection","level":3,"score":0.3379000127315521},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3343999981880188},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C152745839","wikidata":"https://www.wikidata.org/wiki/Q5438153","display_name":"Fault detection and isolation","level":3,"score":0.31150001287460327},{"id":"https://openalex.org/C126953365","wikidata":"https://www.wikidata.org/wiki/Q5438152","display_name":"Fault coverage","level":3,"score":0.3005000054836273},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2565000057220459},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.26461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.26461","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26461","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"NVIDIA":[0],"Multi-Process":[1],"Service":[2],"(MPS)":[3],"enables":[4],"fine-grained":[5],"GPU":[6,26,53,77,117],"sharing":[7],"by":[8,70,111],"allowing":[9],"multiple":[10],"processes":[11],"to":[12,62],"execute":[13],"concurrently":[14],"on":[15,89,147],"the":[16,102,115],"same":[17],"GPU,":[18],"making":[19],"it":[20],"an":[21],"important":[22],"mechanism":[23,100,134],"for":[24,101],"improving":[25],"utilization.":[27],"However,":[28],"MPS":[29,61],"has":[30],"weak":[31],"fault":[32,35,98],"resilience:":[33],"a":[34,73,80,132],"in":[36,47,114],"one":[37],"process":[38,125],"can":[39,107,156],"terminate":[40],"all":[41],"co-running":[42],"processes,":[43],"limiting":[44],"its":[45],"adoption":[46],"resilience-critical":[48],"settings":[49],"such":[50],"as":[51],"multi-tenant":[52],"clusters.":[54],"In":[55],"this":[56,64],"work,":[57],"we":[58,92,130],"design":[59,67,93,131],"fault-resilient":[60],"solve":[63],"problem.":[65],"Our":[66,145],"is":[68,126],"guided":[69],"insights":[71],"from":[72],"systematic":[74],"characterization":[75],"of":[76,83],"faults":[78,105,123,159],"and":[79,150],"deep":[81],"analysis":[82],"their":[84],"end-to-end":[85],"processing":[86],"pipeline.":[87],"Based":[88],"these":[90,154],"insights,":[91],"two":[94],"complementary":[95],"mechanisms.":[96],"A":[97],"isolation":[99],"dominant":[103],"memory-related":[104],"that":[106,153],"be":[108],"fully":[109],"isolated":[110],"software":[112],"intervention":[113],"open":[116],"driver":[118],"kernel":[119],"module.":[120],"For":[121],"other":[122],"whose":[124],"within":[127],"proprietary":[128],"software,":[129],"practical":[133],"--":[135],"fast":[136],"recovery":[137],"using":[138],"virtual":[139],"memory":[140],"based":[141],"GPU-resident":[142],"state":[143],"sharing.":[144],"evaluation":[146],"different":[148],"GPUs":[149],"workloads":[151],"shows":[152],"mechanisms":[155],"handle":[157],"corresponding":[158],"effectively":[160],"with":[161],"minimal":[162],"overhead.":[163]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
