{"id":"https://openalex.org/W7160967038","doi":"https://doi.org/10.48550/arxiv.2605.10670","title":"Surviving Partial Rank Failures in Wide Expert-Parallel MoE Inference","display_name":"Surviving Partial Rank Failures in Wide Expert-Parallel MoE Inference","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7160967038","doi":"https://doi.org/10.48550/arxiv.2605.10670"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.10670","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10670","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.10670","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135952574","display_name":"Xun Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Xun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102307049","display_name":"Shaoyuan Chen","orcid":"https://orcid.org/0000-0003-3526-3241"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Shaoyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101728918","display_name":"Pingchuan Ma","orcid":"https://orcid.org/0000-0001-7680-2817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Pingchuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135948594","display_name":"Yue Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102441532","display_name":"Ziwei Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Ziwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135946227","display_name":"Zhanhao Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Zhanhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110611693","display_name":"Han Han","orcid":"https://orcid.org/0000-0002-4511-4007"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036856464","display_name":"Shangming Cai","orcid":"https://orcid.org/0000-0002-0902-7774"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Shangming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135948784","display_name":"Teng Ma","orcid":"https://orcid.org/0000-0002-7328-2040"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Teng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038868654","display_name":"Xuchun Shang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shang, Xuchun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128271930","display_name":"Xinpeng Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Xinpeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135957950","display_name":"Ke Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135923648","display_name":"Junlin Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Junlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135913567","display_name":"Lianzhi Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Lianzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135951438","display_name":"Yuji Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuji","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135930871","display_name":"Feng Ren","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Feng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135925954","display_name":"Haoran Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Haoran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101706154","display_name":"Cheng Wan","orcid":"https://orcid.org/0000-0002-9818-0139"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044289331","display_name":"Yingdi Shan","orcid":"https://orcid.org/0009-0001-5019-8305"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Yingdi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135937974","display_name":"Yongwei Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yongwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135993410","display_name":"Mingxing Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Mingxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":21,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.43299999833106995,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.43299999833106995,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.09719999879598618,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.08940000087022781,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5134999752044678},{"id":"https://openalex.org/keywords/bounded-function","display_name":"Bounded function","score":0.48500001430511475},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.4733000099658966},{"id":"https://openalex.org/keywords/downtime","display_name":"Downtime","score":0.46389999985694885},{"id":"https://openalex.org/keywords/communication-source","display_name":"Communication source","score":0.44830000400543213},{"id":"https://openalex.org/keywords/skew","display_name":"Skew","score":0.4357999861240387},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.426800012588501},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.36239999532699585},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.3515999913215637}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6815000176429749},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5134999752044678},{"id":"https://openalex.org/C34388435","wikidata":"https://www.wikidata.org/wiki/Q2267362","display_name":"Bounded function","level":2,"score":0.48500001430511475},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.4733000099658966},{"id":"https://openalex.org/C180591934","wikidata":"https://www.wikidata.org/wiki/Q1253369","display_name":"Downtime","level":2,"score":0.46389999985694885},{"id":"https://openalex.org/C198104137","wikidata":"https://www.wikidata.org/wiki/Q974688","display_name":"Communication source","level":2,"score":0.44830000400543213},{"id":"https://openalex.org/C43711488","wikidata":"https://www.wikidata.org/wiki/Q7534783","display_name":"Skew","level":2,"score":0.4357999861240387},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.426800012588501},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.38440001010894775},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.36239999532699585},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.3515999913215637},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3465000092983246},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.3416999876499176},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.32269999384880066},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.313400000333786},{"id":"https://openalex.org/C204854418","wikidata":"https://www.wikidata.org/wiki/Q1362921","display_name":"Polling","level":2,"score":0.311599999666214},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3068000078201294},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.2833000123500824},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2752000093460083},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.2689000070095062},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C136643341","wikidata":"https://www.wikidata.org/wiki/Q1361526","display_name":"Reachability","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C16311509","wikidata":"https://www.wikidata.org/wiki/Q4148050","display_name":"Dependency graph","level":3,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.10670","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10670","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.10670","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.10670","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4711127281188965,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"serving":[2,183],"relies":[3],"on":[4,34],"wide":[5],"expert":[6,79,158],"parallelism":[7],"(EP)":[8],"to":[9,96,172,254],"aggregate":[10],"the":[11,51,82,91,103,140,153,207,258],"memory":[12],"capacity":[13],"and":[14,37,81,127,164,188,196,246,251],"bandwidth":[15],"of":[16,214,257],"many":[17],"GPUs":[18],"within":[19,212,255,261],"one":[20,46],"inference":[21],"instance.":[22],"This":[23,106],"efficiency":[24],"comes":[25],"with":[26,93,186],"a":[27,66,99,117,125,145,161,215,224,236,264],"systems":[28],"cost:":[29],"every":[30],"decoding":[31],"step":[32],"depends":[33],"token":[35],"dispatch":[36],"combination":[38],"across":[39],"all":[40],"active":[41],"EP":[42,55,119,182],"ranks,":[43],"so":[44],"even":[45],"rank":[47,74,197,226],"failure":[48,100,194,238],"can":[49],"disrupt":[50],"entire":[52],"service.":[53],"Existing":[54],"stacks":[56],"handle":[57],"such":[58],"failures":[59],"poorly":[60],"because":[61],"they":[62],"treat":[63],"membership":[64,132,205],"as":[65,116,133],"fixed":[67],"configuration":[68],"established":[69],"at":[70],"initialization.":[71],"The":[72,199],"same":[73],"set":[75],"determines":[76],"communicator":[77],"state,":[78],"placement,":[80],"routing":[83],"metadata":[84],"baked":[85],"into":[86,231],"CUDA":[87,175],"execution":[88],"graphs,":[89],"leaving":[90],"system":[92],"no":[94],"way":[95],"shrink":[97],"around":[98],"while":[101,222],"keeping":[102],"instance":[104],"valid.":[105],"paper":[107],"argues":[108],"that":[109,130,202],"partial-failure":[110],"tolerance":[111],"should":[112],"instead":[113],"be":[114],"formulated":[115],"live":[118],"validity":[120],"problem.":[121],"We":[122,177],"present":[123],"EEP,":[124],"communication":[126,154],"runtime":[128,136],"substrate":[129],"represents":[131],"explicit,":[134],"mutable":[135,204],"state.":[137],"EEP":[138,179,240],"repairs":[139,156],"specific":[141],"state":[142],"invalidated":[143],"by":[144],"fault:":[146],"it":[147,190],"restores":[148,252],"peer":[149],"reachability":[150],"without":[151,168],"rebuilding":[152],"substrate,":[155],"lost":[157],"coverage":[159],"through":[160],"bandwidth-aware":[162],"hierarchy,":[163],"reintegrates":[165],"repaired":[166],"ranks":[167,171],"forcing":[169],"healthy":[170],"recapture":[173],"their":[174],"graphs.":[176],"implement":[178],"in":[180],"an":[181,242,247],"stack":[184],"integrated":[185],"SGLang":[187],"evaluate":[189],"under":[191,219],"steady-state":[192,208],"serving,":[193,221],"recovery,":[195],"reintegration.":[198],"results":[200],"show":[201],"explicit":[203],"preserves":[206],"fast":[209],"path,":[210],"staying":[211],"4.4%":[213],"fixed-membership":[216,265],"DeepEP":[217],"baseline":[218,267],"static":[220],"turning":[223],"local":[225],"fault":[227],"from":[228],"whole-instance":[229],"downtime":[230],"two":[232],"bounded":[233],"interruptions.":[234],"On":[235],"single-rank":[237],"workload,":[239],"incurs":[241],"11s":[243],"recovery":[244],"pause":[245],"8s":[248],"reintegration":[249],"pause,":[250],"throughput":[253],"95%":[256],"pre-fault":[259],"level":[260],"52s,":[262],"whereas":[263],"full-restart":[266],"remains":[268],"unavailable":[269],"until":[270],"348s.":[271]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
