{"id":"https://openalex.org/W4416004439","doi":"https://doi.org/10.1145/3731599.3767453","title":"Scaling LLM Training Using RDMA over Converged Ethernet","display_name":"Scaling LLM Training Using RDMA over Converged Ethernet","publication_year":2025,"publication_date":"2025-11-07","ids":{"openalex":"https://openalex.org/W4416004439","doi":"https://doi.org/10.1145/3731599.3767453"},"language":null,"primary_location":{"id":"doi:10.1145/3731599.3767453","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731599.3767453","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120297667","display_name":"Alex Batlle Casellas","orcid":"https://orcid.org/0009-0006-1469-0358"},"institutions":[{"id":"https://openalex.org/I19268510","display_name":"Qualcomm (United Kingdom)","ror":"https://ror.org/04d3djg48","country_code":"GB","type":"company","lineage":["https://openalex.org/I19268510","https://openalex.org/I4210087596"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Alex Batlle Casellas","raw_affiliation_strings":["Qualcomm Europe, Inc., Barcelona, Spain"],"raw_orcid":"https://orcid.org/0009-0006-1469-0358","affiliations":[{"raw_affiliation_string":"Qualcomm Europe, Inc., Barcelona, Spain","institution_ids":["https://openalex.org/I19268510"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049023670","display_name":"Adri\u00e1n Perez Di\u00e9guez","orcid":"https://orcid.org/0000-0001-7168-9050"},"institutions":[{"id":"https://openalex.org/I4210087596","display_name":"Qualcomm (United States)","ror":"https://ror.org/002zrf773","country_code":"US","type":"company","lineage":["https://openalex.org/I4210087596"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Adri\u00e1n P\u00e9rez Di\u00e9guez","raw_affiliation_strings":["Qualcomm Technologies, Inc., San Diego, USA"],"raw_orcid":"https://orcid.org/0000-0001-7168-9050","affiliations":[{"raw_affiliation_string":"Qualcomm Technologies, Inc., San Diego, USA","institution_ids":["https://openalex.org/I4210087596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120297668","display_name":"Aleix Torres-Camps","orcid":"https://orcid.org/0009-0007-2473-3590"},"institutions":[{"id":"https://openalex.org/I19268510","display_name":"Qualcomm (United Kingdom)","ror":"https://ror.org/04d3djg48","country_code":"GB","type":"company","lineage":["https://openalex.org/I19268510","https://openalex.org/I4210087596"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Aleix Torres-Camps","raw_affiliation_strings":["Qualcomm Europe, Inc., Barcelona, Spain"],"raw_orcid":"https://orcid.org/0009-0007-2473-3590","affiliations":[{"raw_affiliation_string":"Qualcomm Europe, Inc., Barcelona, Spain","institution_ids":["https://openalex.org/I19268510"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070190311","display_name":"Harris Teague","orcid":"https://orcid.org/0000-0002-9409-5674"},"institutions":[{"id":"https://openalex.org/I4210087596","display_name":"Qualcomm (United States)","ror":"https://ror.org/002zrf773","country_code":"US","type":"company","lineage":["https://openalex.org/I4210087596"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Harris Teague","raw_affiliation_strings":["Qualcomm Technologies, Inc., San Diego, USA"],"raw_orcid":"https://orcid.org/0000-0002-9409-5674","affiliations":[{"raw_affiliation_string":"Qualcomm Technologies, Inc., San Diego, USA","institution_ids":["https://openalex.org/I4210087596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120297669","display_name":"Arnau Padres Masdemont","orcid":"https://orcid.org/0009-0004-9907-3548"},"institutions":[{"id":"https://openalex.org/I19268510","display_name":"Qualcomm (United Kingdom)","ror":"https://ror.org/04d3djg48","country_code":"GB","type":"company","lineage":["https://openalex.org/I19268510","https://openalex.org/I4210087596"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Arnau Padres Masdemont","raw_affiliation_strings":["Qualcomm Europe, Inc., Barcelona, Spain"],"raw_orcid":"https://orcid.org/0009-0004-9907-3548","affiliations":[{"raw_affiliation_string":"Qualcomm Europe, Inc., Barcelona, Spain","institution_ids":["https://openalex.org/I19268510"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5011003534","display_name":"Jordi Ros-Giralt","orcid":"https://orcid.org/0000-0003-4450-609X"},"institutions":[{"id":"https://openalex.org/I19268510","display_name":"Qualcomm (United Kingdom)","ror":"https://ror.org/04d3djg48","country_code":"GB","type":"company","lineage":["https://openalex.org/I19268510","https://openalex.org/I4210087596"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jordi Ros-Giralt","raw_affiliation_strings":["Qualcomm Europe, Inc., Barcelona, Spain"],"raw_orcid":"https://orcid.org/0000-0003-4450-609X","affiliations":[{"raw_affiliation_string":"Qualcomm Europe, Inc., Barcelona, Spain","institution_ids":["https://openalex.org/I19268510"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5120297667"],"corresponding_institution_ids":["https://openalex.org/I19268510"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.36968418,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"886","last_page":"896"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.26809999346733093,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.26809999346733093,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10138","display_name":"Network Traffic and Congestion Control","score":0.25519999861717224,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10847","display_name":"Advanced Optical Network Technologies","score":0.16040000319480896,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/infiniband","display_name":"InfiniBand","score":0.9812999963760376},{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.895799994468689},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6086999773979187},{"id":"https://openalex.org/keywords/ethernet","display_name":"Ethernet","score":0.5916000008583069},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5547999739646912},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4431999921798706}],"concepts":[{"id":"https://openalex.org/C2781030343","wikidata":"https://www.wikidata.org/wiki/Q922437","display_name":"InfiniBand","level":2,"score":0.9812999963760376},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.895799994468689},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7738999724388123},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6086999773979187},{"id":"https://openalex.org/C172173386","wikidata":"https://www.wikidata.org/wiki/Q79984","display_name":"Ethernet","level":2,"score":0.5916000008583069},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5547999739646912},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4431999921798706},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.4422000050544739},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.42160001397132874},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.35440000891685486},{"id":"https://openalex.org/C53833338","wikidata":"https://www.wikidata.org/wiki/Q1061424","display_name":"Context switch","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.33469998836517334},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.2865999937057495},{"id":"https://openalex.org/C2776379158","wikidata":"https://www.wikidata.org/wiki/Q1069084","display_name":"Gigabit Ethernet","level":3,"score":0.25540000200271606},{"id":"https://openalex.org/C151898751","wikidata":"https://www.wikidata.org/wiki/Q5046350","display_name":"Carrier Ethernet","level":3,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3731599.3767453","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3731599.3767453","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W2064332668","https://openalex.org/W2130531694","https://openalex.org/W2498764059","https://openalex.org/W3016926008","https://openalex.org/W3129831491","https://openalex.org/W4234143859","https://openalex.org/W4401176521"],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"a":[2,54],"comprehensive":[3],"benchmarking":[4],"study":[5],"that":[6,65],"evaluates":[7],"the":[8,23],"scaling":[9,70],"performance":[10,71],"of":[11,25],"RDMA":[12],"over":[13],"Converged":[14],"Ethernet":[15,50],"(RoCE)":[16],"and":[17,44,51,85,92,101],"compares":[18],"it":[19,40],"with":[20],"InfiniBand":[21,31,74],"in":[22],"context":[24],"large-scale":[26],"LLM":[27],"training":[28],"workloads.":[29],"While":[30],"is":[32],"traditionally":[33],"favored":[34],"for":[35,98],"its":[36],"low-latency,":[37],"high-bandwidth":[38],"characteristics,":[39],"imposes":[41],"significant":[42],"infrastructure":[43],"operational":[45],"costs.":[46],"RoCE,":[47],"leveraging":[48],"commodity":[49],"RDMA,":[52],"offers":[53],"cost-effective":[55],"alternative.":[56],"Through":[57],"extensive":[58],"experiments":[59],"on":[60],"production":[61],"clusters,":[62],"we":[63],"demonstrate":[64],"RoCE":[66],"can":[67],"achieve":[68],"near-linear":[69],"comparable":[72],"to":[73],"when":[75],"properly":[76],"configured.":[77],"Our":[78],"analysis":[79],"spans":[80],"data":[81],"sharding":[82],"strategies,":[83],"quantization":[84],"activation":[86],"recomputation":[87],"techniques,":[88],"batch":[89],"size":[90],"tuning,":[91],"system-level":[93],"optimizations,":[94],"providing":[95],"practical":[96],"guidance":[97],"designing":[99],"scalable":[100],"efficient":[102],"AI":[103],"infrastructure.":[104]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-07T00:00:00"}
