{"id":"https://openalex.org/W4415445364","doi":"https://doi.org/10.1109/lca.2025.3624158","title":"Reimagining RDMA Through the Lens of ML","display_name":"Reimagining RDMA Through the Lens of ML","publication_year":2025,"publication_date":"2025-07-01","ids":{"openalex":"https://openalex.org/W4415445364","doi":"https://doi.org/10.1109/lca.2025.3624158"},"language":null,"primary_location":{"id":"doi:10.1109/lca.2025.3624158","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3624158","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2510.16606","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033908933","display_name":"Ertza Warraich","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ertza Warraich","raw_affiliation_strings":["Department of Computer Science, Purdue University, West Lafayette, IN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103926283","display_name":"Ali Imran","orcid":"https://orcid.org/0009-0009-7194-2357"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ali Imran","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Michigan, Ann Arbor, MI, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Michigan, Ann Arbor, MI, USA","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039603026","display_name":"Annus Zulfiqar","orcid":"https://orcid.org/0000-0003-0612-4939"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Annus Zulfiqar","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Michigan, Ann Arbor, MI, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Michigan, Ann Arbor, MI, USA","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078049483","display_name":"Shay Vargaftik","orcid":"https://orcid.org/0000-0002-0982-7894"},"institutions":[{"id":"https://openalex.org/I4210127325","display_name":"Broadcom (United States)","ror":"https://ror.org/035gt5s03","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127325"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shay Vargaftik","raw_affiliation_strings":["Broadcom, Palo Alto, CA, USA"],"affiliations":[{"raw_affiliation_string":"Broadcom, Palo Alto, CA, USA","institution_ids":["https://openalex.org/I4210127325"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014376333","display_name":"Sonia Fahmy","orcid":"https://orcid.org/0000-0003-2870-7166"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sonia Fahmy","raw_affiliation_strings":["Department of Computer Science, Purdue University, West Lafayette, IN, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100610963","display_name":"Muhammad Shahbaz","orcid":"https://orcid.org/0000-0001-5168-9045"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Muhammad Shahbaz","raw_affiliation_strings":["Department of Computer Science and Engineering, University of Michigan, Ann Arbor, MI, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, University of Michigan, Ann Arbor, MI, USA","institution_ids":["https://openalex.org/I27837315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5033908933"],"corresponding_institution_ids":["https://openalex.org/I219193219"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15826362,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"24","issue":"2","first_page":"393","last_page":"396"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.5568000078201294,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.5568000078201294,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.5112000107765198,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.9775000214576721},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7397000193595886},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.6532999873161316},{"id":"https://openalex.org/keywords/network-packet","display_name":"Network packet","score":0.6413000226020813},{"id":"https://openalex.org/keywords/infiniband","display_name":"InfiniBand","score":0.5993000268936157},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5170000195503235},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.4805999994277954},{"id":"https://openalex.org/keywords/stream-control-transmission-protocol","display_name":"Stream Control Transmission Protocol","score":0.4602999985218048}],"concepts":[{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.9775000214576721},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8614000082015991},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7397000193595886},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.6532999873161316},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.6413000226020813},{"id":"https://openalex.org/C2781030343","wikidata":"https://www.wikidata.org/wiki/Q922437","display_name":"InfiniBand","level":2,"score":0.5993000268936157},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.5917999744415283},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5170000195503235},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.4805999994277954},{"id":"https://openalex.org/C90377713","wikidata":"https://www.wikidata.org/wiki/Q576997","display_name":"Stream Control Transmission Protocol","level":3,"score":0.4602999985218048},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4415999948978424},{"id":"https://openalex.org/C195563490","wikidata":"https://www.wikidata.org/wiki/Q180368","display_name":"Network congestion","level":3,"score":0.4377000033855438},{"id":"https://openalex.org/C54108766","wikidata":"https://www.wikidata.org/wiki/Q391064","display_name":"Packet loss","level":3,"score":0.4343000054359436},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.421999990940094},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3824000060558319},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.3564999997615814},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C20574231","wikidata":"https://www.wikidata.org/wiki/Q844605","display_name":"Backward compatibility","level":2,"score":0.3206999897956848},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3057999908924103},{"id":"https://openalex.org/C109751979","wikidata":"https://www.wikidata.org/wiki/Q998767","display_name":"Failover","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.27970001101493835},{"id":"https://openalex.org/C185177783","wikidata":"https://www.wikidata.org/wiki/Q3332814","display_name":"Megabit","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C204156049","wikidata":"https://www.wikidata.org/wiki/Q751436","display_name":"Inter-process communication","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.2549999952316284},{"id":"https://openalex.org/C193519340","wikidata":"https://www.wikidata.org/wiki/Q891179","display_name":"Data loss","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/lca.2025.3624158","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3624158","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2510.16606","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.16606","pdf_url":"https://arxiv.org/pdf/2510.16606","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2510.16606","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.16606","pdf_url":"https://arxiv.org/pdf/2510.16606","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"distributed":[1],"machine":[2],"learning":[3],"(ML)":[4],"workloads":[5],"scale":[6,61],"to":[7,46,141,162,174],"thousands":[8],"of":[9,113],"GPUs":[10],"connected":[11],"by":[12,160,167],"ultra-high-speed":[13],"interconnects,":[14],"tail":[15],"latency":[16,59,159],"in":[17],"collective":[18],"communication":[19,126],"has":[20],"emerged":[21],"as":[22,131],"a":[23,78,176],"primary":[24],"bottleneck.":[25],"Prior":[26],"RDMA":[27,80,104],"designs,":[28],"like":[29],"RoCE,":[30],"IRN,":[31],"and":[32,37,43,58,99,124,134,169],"SRNIC,":[33],"enforce":[34],"strict":[35],"reliability":[36,85],"in-order":[38,100],"delivery,":[39],"relying":[40],"on":[41,88],"retransmissions":[42,98],"packet":[44,66],"sequencing":[45],"ensure":[47],"correctness.":[48],"While":[49],"effective":[50],"for":[51,91,181],"general-purpose":[52],"workloads,":[53],"these":[54],"mechanisms":[55,129],"introduce":[56,76],"complexity":[57],"that":[60,82,109,155],"poorly,":[62],"where":[63],"even":[64],"rare":[65],"losses":[67],"or":[68,93],"delays":[69],"can":[70],"consistently":[71],"degrade":[72],"system":[73],"performance.":[74],"We":[75],"Celeris,":[77],"domain-specific":[79],"transport":[81,108,179],"revisits":[83],"traditional":[84],"guarantees":[86],"based":[87],"ML's":[89],"tolerance":[90],"lost":[92],"partial":[94],"data.":[95],"Celeris":[96,156],"removes":[97],"delivery":[101],"from":[102],"the":[103,111,142,149],"NIC,":[105],"enabling":[106],"best-effort":[107],"exploits":[110],"robustness":[112],"ML":[114,143,182],"workloads.":[115],"It":[116],"retains":[117],"congestion":[118],"control":[119],"(<italic":[120,145],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[121,146],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">e.g.</i>,":[122,147],"DCQCN)":[123],"manages":[125],"with":[127],"software-level":[128],"such":[130],"adaptive":[132],"timeouts":[133],"data":[135],"prioritization,":[136],"while":[137],"shifting":[138],"loss":[139],"recovery":[140],"pipeline":[144],"using":[148],"Hadamard":[150],"Transform).":[151],"Early":[152],"results":[153],"show":[154],"reduces":[157],"99th-percentile":[158],"up":[161],"2.3\u00d7,":[163],"cuts":[164],"BRAM":[165],"usage":[166],"67%,":[168],"nearly":[170],"doubles":[171],"NIC":[172],"resilience":[173],"faults\u2014delivering":[175],"resilient,":[177],"scalable":[178],"tailored":[180],"at":[183],"cluster":[184],"scale.":[185]},"counts_by_year":[],"updated_date":"2026-04-04T08:04:53.788161","created_date":"2025-10-24T00:00:00"}
