{"id":"https://openalex.org/W7125586039","doi":"https://doi.org/10.1109/tnse.2026.3657102","title":"Enhancing Remote Direct Memory Access Network With Loss-Tolerant Flow Control for Distributed AI Clusters","display_name":"Enhancing Remote Direct Memory Access Network With Loss-Tolerant Flow Control for Distributed AI Clusters","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7125586039","doi":"https://doi.org/10.1109/tnse.2026.3657102"},"language":null,"primary_location":{"id":"doi:10.1109/tnse.2026.3657102","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnse.2026.3657102","pdf_url":null,"source":{"id":"https://openalex.org/S2484352698","display_name":"IEEE Transactions on Network Science and Engineering","issn_l":"2327-4697","issn":["2327-4697","2334-329X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Network Science and Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yibo Wang","orcid":"https://orcid.org/0009-0005-1537-8633"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yibo Wang","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-1537-8633","affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123783459","display_name":"Wei Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Wang","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-0503-2816","affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001549734","display_name":"Qiaojun Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiaojun Hu","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-3723-2186","affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yiyang Li","orcid":"https://orcid.org/0009-0005-4383-9564"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiyang Li","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-4383-9564","affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047239114","display_name":"Yanran Xiao","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanran Xiao","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030811766","display_name":"Yongli Zhao","orcid":"https://orcid.org/0000-0003-3716-8248"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yongli Zhao","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-3716-8248","affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaoyu Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210130112","display_name":"China Academy of Information and Communications Technology","ror":"https://ror.org/038dte259","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210130112","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Wang","raw_affiliation_strings":["China Academy of Information and Communications Technology, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China Academy of Information and Communications Technology, Beijing, China","institution_ids":["https://openalex.org/I4210130112"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5123728221","display_name":"Jie Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-7750-2197","affiliations":[{"raw_affiliation_string":"School of Electronic Engineering, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10813653,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"13","issue":null,"first_page":"6640","last_page":"6653"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.24070000648498535,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.24070000648498535,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.19439999759197235,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.14659999310970306,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/retransmission","display_name":"Retransmission","score":0.7773000001907349},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.7203999757766724},{"id":"https://openalex.org/keywords/packet-loss","display_name":"Packet loss","score":0.5268999934196472},{"id":"https://openalex.org/keywords/network-packet","display_name":"Network packet","score":0.5127000212669373},{"id":"https://openalex.org/keywords/flow-control","display_name":"Flow control (data)","score":0.508899986743927},{"id":"https://openalex.org/keywords/payload","display_name":"Payload (computing)","score":0.48510000109672546},{"id":"https://openalex.org/keywords/network-congestion","display_name":"Network congestion","score":0.45089998841285706},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.42340001463890076},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.42309999465942383},{"id":"https://openalex.org/keywords/timer","display_name":"Timer","score":0.4198000133037567}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8172000050544739},{"id":"https://openalex.org/C180611318","wikidata":"https://www.wikidata.org/wiki/Q7316902","display_name":"Retransmission","level":3,"score":0.7773000001907349},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.7203999757766724},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.5282999873161316},{"id":"https://openalex.org/C54108766","wikidata":"https://www.wikidata.org/wiki/Q391064","display_name":"Packet loss","level":3,"score":0.5268999934196472},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.5127000212669373},{"id":"https://openalex.org/C186766456","wikidata":"https://www.wikidata.org/wiki/Q612457","display_name":"Flow control (data)","level":2,"score":0.508899986743927},{"id":"https://openalex.org/C134066672","wikidata":"https://www.wikidata.org/wiki/Q1424639","display_name":"Payload (computing)","level":3,"score":0.48510000109672546},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.45260000228881836},{"id":"https://openalex.org/C195563490","wikidata":"https://www.wikidata.org/wiki/Q180368","display_name":"Network congestion","level":3,"score":0.45089998841285706},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.42340001463890076},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.42309999465942383},{"id":"https://openalex.org/C2776633867","wikidata":"https://www.wikidata.org/wiki/Q186612","display_name":"Timer","level":3,"score":0.4198000133037567},{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.40799999237060547},{"id":"https://openalex.org/C113508815","wikidata":"https://www.wikidata.org/wiki/Q193446","display_name":"Packet switching","level":3,"score":0.38119998574256897},{"id":"https://openalex.org/C527821871","wikidata":"https://www.wikidata.org/wiki/Q228502","display_name":"Access control","level":2,"score":0.37470000982284546},{"id":"https://openalex.org/C40842320","wikidata":"https://www.wikidata.org/wiki/Q19423","display_name":"Buffer overflow","level":2,"score":0.35569998621940613},{"id":"https://openalex.org/C160191386","wikidata":"https://www.wikidata.org/wiki/Q868299","display_name":"Control flow","level":2,"score":0.3467000126838684},{"id":"https://openalex.org/C180591934","wikidata":"https://www.wikidata.org/wiki/Q1253369","display_name":"Downtime","level":2,"score":0.34540000557899475},{"id":"https://openalex.org/C203274722","wikidata":"https://www.wikidata.org/wiki/Q7001161","display_name":"Network performance","level":2,"score":0.3416999876499176},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.33230000734329224},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.32679998874664307},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3240000009536743},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3237000107765198},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.31349998712539673},{"id":"https://openalex.org/C123745756","wikidata":"https://www.wikidata.org/wiki/Q1665949","display_name":"Interconnection","level":2,"score":0.29649999737739563},{"id":"https://openalex.org/C165021410","wikidata":"https://www.wikidata.org/wiki/Q55564","display_name":"Lossy compression","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C159631557","wikidata":"https://www.wikidata.org/wiki/Q1546066","display_name":"Networking hardware","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C197417287","wikidata":"https://www.wikidata.org/wiki/Q7098837","display_name":"Optical burst switching","level":5,"score":0.26330000162124634},{"id":"https://openalex.org/C114809511","wikidata":"https://www.wikidata.org/wiki/Q1412924","display_name":"Flow network","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C149810388","wikidata":"https://www.wikidata.org/wiki/Q5374873","display_name":"Emulation","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C3770464","wikidata":"https://www.wikidata.org/wiki/Q775963","display_name":"Smoothing","level":2,"score":0.2540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tnse.2026.3657102","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tnse.2026.3657102","pdf_url":null,"source":{"id":"https://openalex.org/S2484352698","display_name":"IEEE Transactions on Network Science and Engineering","issn_l":"2327-4697","issn":["2327-4697","2334-329X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Network Science and Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":64,"referenced_works":["https://openalex.org/W1984788566","https://openalex.org/W2057332538","https://openalex.org/W2064675550","https://openalex.org/W2083842231","https://openalex.org/W2100785182","https://openalex.org/W2164740236","https://openalex.org/W2194775991","https://openalex.org/W2298436731","https://openalex.org/W2440033994","https://openalex.org/W2498764059","https://openalex.org/W2558580397","https://openalex.org/W2618530766","https://openalex.org/W2743073148","https://openalex.org/W2744387122","https://openalex.org/W2794896506","https://openalex.org/W2809353470","https://openalex.org/W2891983498","https://openalex.org/W2926543120","https://openalex.org/W2944456270","https://openalex.org/W2963433607","https://openalex.org/W2964412985","https://openalex.org/W2967496857","https://openalex.org/W2968108410","https://openalex.org/W2968755040","https://openalex.org/W2975712713","https://openalex.org/W3002973041","https://openalex.org/W3004095116","https://openalex.org/W3035748372","https://openalex.org/W3046330538","https://openalex.org/W3046470751","https://openalex.org/W3046750740","https://openalex.org/W3091097978","https://openalex.org/W3093895572","https://openalex.org/W3106918569","https://openalex.org/W3109063463","https://openalex.org/W3190053701","https://openalex.org/W3193378579","https://openalex.org/W4224608762","https://openalex.org/W4232284301","https://openalex.org/W4235732943","https://openalex.org/W4236530411","https://openalex.org/W4240522432","https://openalex.org/W4246193833","https://openalex.org/W4285180525","https://openalex.org/W4289535786","https://openalex.org/W4296915419","https://openalex.org/W4309229665","https://openalex.org/W4380303566","https://openalex.org/W4383212214","https://openalex.org/W4385367428","https://openalex.org/W4386249662","https://openalex.org/W4387831809","https://openalex.org/W4389161427","https://openalex.org/W4391248729","https://openalex.org/W4391945833","https://openalex.org/W4401176820","https://openalex.org/W4402775336","https://openalex.org/W4402897059","https://openalex.org/W4408251903","https://openalex.org/W4409356441","https://openalex.org/W4411232542","https://openalex.org/W4413318953","https://openalex.org/W4413756872","https://openalex.org/W4414539025"],"related_works":[],"abstract_inverted_index":{"Current":[0],"AI":[1],"training":[2],"clusters":[3],"widely":[4],"utilize":[5],"RoCEv2":[6,17],"to":[7,24,48,84,90,101,127,135,178,186],"enhance":[8],"the":[9,50,92,97,102,108,111,146,157,169],"communication":[10],"efficiency":[11],"of":[12,67,150,159],"interconnect":[13],"networks":[14],"across":[15],"machines.":[16],"relies":[18],"on":[19,82,152],"priority":[20],"flow":[21,75,171],"control":[22,76,91],"(PFC)":[23],"ensure":[25,85],"a":[26,73,86,124],"lossless":[27,87],"network.":[28],"However,":[29],"PFC":[30,83,112],"brings":[31],"specific":[32],"side":[33,51],"effects,":[34],"such":[35],"as":[36],"head-of-line":[37],"blocking,":[38],"congestion":[39],"spreading,":[40],"and":[41,155,180],"deadlock.":[42],"Numerous":[43],"studies":[44],"have":[45],"been":[46],"proposed":[47],"eliminate":[49,128],"effects.":[52],"Unlike":[53],"traditional":[54,103],"high-performance":[55],"computing":[56],"applications,":[57],"distributed":[58],"machine":[59],"learning":[60],"(DML)":[61],"is":[62],"bounded-loss":[63,147],"tolerant.":[64],"In":[65],"light":[66],"this":[68,70],"observation,":[69],"paper":[71],"proposes":[72],"loss-tolerant":[74],"(LTFC).":[77],"LTFC":[78,106,134,160,167],"does":[79],"not":[80],"rely":[81],"environment,":[88],"but":[89],"packet":[93],"loss":[94],"ratio":[95],"within":[96],"tolerance":[98,148],"threshold.":[99],"Compared":[100],"trigger":[104],"condition,":[105],"reduces":[107,168],"likelihood":[109],"that":[110,166],"will":[113],"be":[114],"triggered.":[115],"Additionally,":[116],"we":[117,132],"replace":[118],"RoCEv2's":[119],"default":[120],"Go-back-N":[121],"mechanism":[122,126],"with":[123],"non-retransmission":[125],"retransmission":[129],"delay.":[130],"Further,":[131],"adapt":[133],"high":[136],"network":[137],"load":[138],"scenarios":[139],"through":[140],"loss-aware":[141],"buffer":[142],"management.":[143],"We":[144],"demonstrate":[145],"feature":[149],"DML":[151],"our":[153],"testbed":[154],"evaluate":[156],"performance":[158],"in":[161],"simulations.":[162],"Simulation":[163],"results":[164],"show":[165],"average":[170],"completion":[172],"time":[173],"(FCT)":[174],"slowdown":[175,183],"by":[176,184],"up":[177,185],"27.2%":[179],"tail":[181],"FCT":[182],"23.8%.":[187]},"counts_by_year":[],"updated_date":"2026-02-17T06:05:46.635709","created_date":"2026-01-25T00:00:00"}
