{"id":"https://openalex.org/W7123649291","doi":"https://doi.org/10.1145/3772052.3772225","title":"THORN-ML: Transparent Hardware Offloaded Resilient Networks for RDMA based Distributed ML Workloads","display_name":"THORN-ML: Transparent Hardware Offloaded Resilient Networks for RDMA based Distributed ML Workloads","publication_year":2025,"publication_date":"2025-11-19","ids":{"openalex":"https://openalex.org/W7123649291","doi":"https://doi.org/10.1145/3772052.3772225"},"language":null,"primary_location":{"id":"doi:10.1145/3772052.3772225","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772225","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3772052.3772225","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047682611","display_name":"Maziyar Nazari","orcid":"https://orcid.org/0000-0002-7355-2426"},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Maziyar Nazari","raw_affiliation_strings":["University of Colorado Boulder, Boulder, USA"],"raw_orcid":"https://orcid.org/0000-0002-7355-2426","affiliations":[{"raw_affiliation_string":"University of Colorado Boulder, Boulder, USA","institution_ids":["https://openalex.org/I188538660"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040193265","display_name":"Daniel W. Noland","orcid":null},"institutions":[{"id":"https://openalex.org/I2799870806","display_name":"Longmont United Hospital","ror":"https://ror.org/047ge9e14","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I2799870806"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel Noland","raw_affiliation_strings":["Unaffiliated, Longmont, CO, USA"],"raw_orcid":"https://orcid.org/0009-0005-3340-5316","affiliations":[{"raw_affiliation_string":"Unaffiliated, Longmont, CO, USA","institution_ids":["https://openalex.org/I2799870806"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069596582","display_name":"Giulio Sidoretti","orcid":"https://orcid.org/0000-0002-7317-1834"},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Giulio Sidoretti","raw_affiliation_strings":["University of Colorado Boulder, Boulder, CO, USA"],"raw_orcid":"https://orcid.org/0000-0002-7317-1834","affiliations":[{"raw_affiliation_string":"University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039436819","display_name":"Erika Hunhoff","orcid":null},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Erika Hunhoff","raw_affiliation_strings":["University of Colorado Boulder, Boulder, CO, USA"],"raw_orcid":"https://orcid.org/0000-0001-5499-8871","affiliations":[{"raw_affiliation_string":"University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020218455","display_name":"Tamara Silbergleit Lehman","orcid":"https://orcid.org/0000-0001-9779-1838"},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tamara Silbergleit Lehman","raw_affiliation_strings":["University of Colorado Boulder, Boulder, CO, USA"],"raw_orcid":"https://orcid.org/0000-0001-9779-1838","affiliations":[{"raw_affiliation_string":"University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025861504","display_name":"Eric Keller","orcid":"https://orcid.org/0000-0003-2556-9394"},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Eric Keller","raw_affiliation_strings":["University of Colorado Boulder, Boulder, CO, USA"],"raw_orcid":"https://orcid.org/0000-0003-2556-9394","affiliations":[{"raw_affiliation_string":"University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75338527,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"196","last_page":"208"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.35420000553131104,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.35420000553131104,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.24130000174045563,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.11819999665021896,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/host","display_name":"Host (biology)","score":0.771399974822998},{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.7458000183105469},{"id":"https://openalex.org/keywords/networking-hardware","display_name":"Networking hardware","score":0.5871999859809875},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.5705999732017517},{"id":"https://openalex.org/keywords/data-center","display_name":"Data center","score":0.5073000192642212},{"id":"https://openalex.org/keywords/network-packet","display_name":"Network packet","score":0.5012999773025513},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.3540000021457672},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.35370001196861267},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.352400004863739}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8220000267028809},{"id":"https://openalex.org/C126831891","wikidata":"https://www.wikidata.org/wiki/Q221673","display_name":"Host (biology)","level":2,"score":0.771399974822998},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.7458000183105469},{"id":"https://openalex.org/C159631557","wikidata":"https://www.wikidata.org/wiki/Q1546066","display_name":"Networking hardware","level":2,"score":0.5871999859809875},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.5705999732017517},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.5192999839782715},{"id":"https://openalex.org/C153740404","wikidata":"https://www.wikidata.org/wiki/Q671224","display_name":"Data center","level":2,"score":0.5073000192642212},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.503600001335144},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.5012999773025513},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3935999870300293},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.36500000953674316},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.3540000021457672},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.35370001196861267},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.352400004863739},{"id":"https://openalex.org/C74366991","wikidata":"https://www.wikidata.org/wiki/Q2755335","display_name":"Network processor","level":3,"score":0.3382999897003174},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.335999995470047},{"id":"https://openalex.org/C65813073","wikidata":"https://www.wikidata.org/wiki/Q1622420","display_name":"High availability","level":2,"score":0.30640000104904175},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3009999990463257},{"id":"https://openalex.org/C193415008","wikidata":"https://www.wikidata.org/wiki/Q639681","display_name":"Network architecture","level":2,"score":0.2935999929904938},{"id":"https://openalex.org/C172086080","wikidata":"https://www.wikidata.org/wiki/Q62270","display_name":"Remote procedure call","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.26510000228881836},{"id":"https://openalex.org/C70061542","wikidata":"https://www.wikidata.org/wiki/Q989016","display_name":"Distributed database","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C5038329","wikidata":"https://www.wikidata.org/wiki/Q1142907","display_name":"Core network","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C169851745","wikidata":"https://www.wikidata.org/wiki/Q1331985","display_name":"Overlay network","level":3,"score":0.26190000772476196},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C28180684","wikidata":"https://www.wikidata.org/wiki/Q4080983","display_name":"Memory safety","level":3,"score":0.2535000145435333},{"id":"https://openalex.org/C4373008","wikidata":"https://www.wikidata.org/wiki/Q513349","display_name":"File server","level":2,"score":0.2517000138759613}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3772052.3772225","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772225","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3772052.3772225","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772052.3772225","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM Symposium on Cloud Computing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.6576095223426819,"display_name":"Industry, innovation and infrastructure"}],"awards":[{"id":"https://openalex.org/G5138956579","display_name":null,"funder_award_id":"2241818","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W2157614013","https://openalex.org/W2234600635","https://openalex.org/W2281291499","https://openalex.org/W2298436731","https://openalex.org/W3189830509","https://openalex.org/W3213182453","https://openalex.org/W4290990894","https://openalex.org/W4386385105","https://openalex.org/W4400237214","https://openalex.org/W4401176521"],"related_works":[],"abstract_inverted_index":{"Distributed":[0],"deep":[1],"learning":[2],"(DDL)":[3],"requires":[4],"a":[5,30,33,91,112],"great":[6],"investment":[7],"in":[8,69],"cloud":[9],"infrastructure,":[10],"including":[11],"accelerated":[12],"compute":[13],"nodes":[14,116],"and":[15,49,103,121,134],"networking":[16],"hardware":[17],"capable":[18],"of":[19,114],"supporting":[20],"high-performance":[21],"networking,":[22],"e.g.,":[23],"Remote":[24],"Direct":[25],"Memory":[26],"Access":[27],"(RDMA).":[28],"When":[29,51],"host":[31,53,58,72],"running":[32],"DDL":[34,101],"application":[35],"becomes":[36],"unreachable,":[37],"the":[38,52,65,71,74,143],"cost":[39,84],"can":[40,151],"be":[41,152],"high":[42],"as":[43],"application-level":[44],"failure":[45],"recovery":[46],"is":[47,54,61,85,97],"slow":[48],"disruptive.":[50],"unreachable":[55],"due":[56],"to":[57,73,100,146],"failure,":[59],"this":[60,83],"unavoidable;":[62],"however,":[63],"when":[64],"network":[66,78,94],"components":[67],"involved":[68],"attaching":[70],"core":[75],"data":[76,132],"center":[77],"fail,":[79],"we":[80],"argue":[81],"that":[82,96,136,150],"avoidable.":[86],"This":[87],"paper":[88],"introduces":[89],"THORN-ML,":[90],"hardware-offloaded":[92],"resilient":[93],"architecture":[95],"completely":[98],"transparent":[99],"applications":[102,127],"works":[104],"with":[105,117,125],"commodity":[106],"hardware.":[107],"We":[108],"evaluate":[109],"THORN-ML":[110,137],"on":[111],"cluster":[113],"5":[115],"Nvidia":[118],"A100":[119],"GPUs":[120],"Mellanox":[122],"ConnectX-5":[123],"NICs,":[124],"several":[126],"leveraging":[128],"model":[129],"parallelism":[130],"and/or":[131],"parallelism,":[133],"find":[135],"reduces":[138],"disruption":[139],"from":[140],"minutes":[141],"(impacting":[142,148],"whole":[144],"cluster)":[145],"milliseconds":[147],"packets":[149],"re-transmitted).":[153]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-14T00:00:00"}
