{"id":"https://openalex.org/W7147667792","doi":"https://doi.org/10.48550/arxiv.2603.28001","title":"Varuna: Enabling Failure-Type Aware RDMA Failover","display_name":"Varuna: Enabling Failure-Type Aware RDMA Failover","publication_year":2026,"publication_date":"2026-03-30","ids":{"openalex":"https://openalex.org/W7147667792","doi":"https://doi.org/10.48550/arxiv.2603.28001"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.28001","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28001","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.28001","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132554678","display_name":"Xiaoyang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Xiaoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101970462","display_name":"Yongkun Li","orcid":"https://orcid.org/0000-0002-3743-8511"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yongkun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063339501","display_name":"Lulu Yao","orcid":"https://orcid.org/0000-0001-9116-0330"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Lulu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101608275","display_name":"Guoli Wei","orcid":"https://orcid.org/0000-0001-6028-1586"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Guoli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132652311","display_name":"Longcheng Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Longcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132577793","display_name":"Yinlong Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yinlong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132701102","display_name":"Weiqing Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Weiqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011668333","display_name":"Weiguang Wang","orcid":"https://orcid.org/0009-0004-3301-465X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Weiguang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132604012","display_name":"Peng Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5035456549","display_name":"Bingyang Liu","orcid":"https://orcid.org/0000-0002-4572-4956"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Bingyang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5132554678"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.7379999756813049,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.7379999756813049,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.09000000357627869,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.07010000199079514,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.9387999773025513},{"id":"https://openalex.org/keywords/failover","display_name":"Failover","score":0.876800000667572},{"id":"https://openalex.org/keywords/retransmission","display_name":"Retransmission","score":0.8766000270843506},{"id":"https://openalex.org/keywords/backup","display_name":"Backup","score":0.5273000001907349},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5087000131607056},{"id":"https://openalex.org/keywords/timestamp","display_name":"Timestamp","score":0.32580000162124634}],"concepts":[{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.9387999773025513},{"id":"https://openalex.org/C109751979","wikidata":"https://www.wikidata.org/wiki/Q998767","display_name":"Failover","level":2,"score":0.876800000667572},{"id":"https://openalex.org/C180611318","wikidata":"https://www.wikidata.org/wiki/Q7316902","display_name":"Retransmission","level":3,"score":0.8766000270843506},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8264999985694885},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.6380000114440918},{"id":"https://openalex.org/C2780945871","wikidata":"https://www.wikidata.org/wiki/Q194274","display_name":"Backup","level":2,"score":0.5273000001907349},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.5126000046730042},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5087000131607056},{"id":"https://openalex.org/C113954288","wikidata":"https://www.wikidata.org/wiki/Q186885","display_name":"Timestamp","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C50050167","wikidata":"https://www.wikidata.org/wiki/Q4300498","display_name":"Slumping","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C503016453","wikidata":"https://www.wikidata.org/wiki/Q4839801","display_name":"Backup software","level":3,"score":0.3005000054836273},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.263700008392334},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.25690001249313354}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.28001","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28001","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.28001","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.28001","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"RDMA":[0,37,97,115,157,190],"link":[1,25,29,119],"failures":[2],"can":[3,54,88],"render":[4],"connections":[5],"temporarily":[6],"unavailable,":[7],"causing":[8],"both":[9],"performance":[10],"degradation":[11],"and":[12,59,104,131,143,155,179,185],"significant":[13],"recovery":[14,98,173],"overhead.":[15],"To":[16],"tolerate":[17],"such":[18,47],"failures,":[19],"production":[20],"datacenters":[21],"assign":[22],"each":[23],"primary":[24],"with":[26],"a":[27,95,109,118],"standby":[28],"and,":[30],"upon":[31],"failure,":[32,120],"uniformly":[33],"retransmit":[34],"all":[35],"in-flight":[36,126],"request":[38],"over":[39],"the":[40,65,140,145],"backup":[41],"path.":[42],"However,":[43],"we":[44],"observe":[45],"that":[46,100],"blanket":[48],"retransmission":[49,103,174],"is":[50,73],"unnecessary.":[51],"In-flight":[52],"requests":[53,72,127],"be":[55],"split":[56],"into":[57],"pre-failure":[58,141],"post-failure":[60,71,149],"categories":[61],"depending":[62],"on":[63,113],"whether":[64],"responder":[66],"has":[67],"already":[68],"executed.":[69],"Retransmitting":[70],"not":[74],"only":[75,139,162],"redundant":[76],"(consuming":[77],"bandwidth),":[78],"but":[79],"also":[80],"incorrect":[81],"for":[82,148],"non-idempotent":[83],"operations,":[84],"where":[85],"duplicate":[86],"execution":[87],"violate":[89],"application":[90],"semantics.":[91],"We":[92],"present":[93],"Varuna,":[94],"failure-type-aware":[96],"mechanism":[99],"enables":[101],"correct":[102],"us-level":[105],"failover.":[106,191],"Varuna":[107,136,160],"piggybacks":[108],"lightweight":[110],"completion":[111],"log":[112,122],"every":[114],"operation;":[116],"after":[117],"this":[121],"deterministically":[123],"reveals":[124],"which":[125,132],"were":[128,133],"executed":[129],"(post-failure)":[130],"lost":[134],"(pre-failure).":[135],"then":[137],"retransmits":[138],"subset":[142],"fetches/recovers":[144],"return":[146],"values":[147],"requests.":[150],"Evaluated":[151],"using":[152],"synthetic":[153],"microbenchmarks":[154],"end-to-end":[156],"TPC-C":[158],"transactions,":[159],"incurs":[161],"0.6-10%":[163],"steady-state":[164],"latency":[165],"overhead":[166,184,188],"in":[167],"realistic":[168],"applications,":[169],"eliminates":[170],"65%":[171],"of":[172],"time,":[175],"preserves":[176],"transactional":[177],"consistency,":[178],"introduces":[180],"zero":[181],"connectivity":[182],"rebuild":[183],"negligible":[186],"memory":[187],"during":[189]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
