{"id":"https://openalex.org/W4416297968","doi":"https://doi.org/10.1145/3772356.3772384","title":"FlowPulse: Catching Network Failures in ML Clusters","display_name":"FlowPulse: Catching Network Failures in ML Clusters","publication_year":2025,"publication_date":"2025-11-17","ids":{"openalex":"https://openalex.org/W4416297968","doi":"https://doi.org/10.1145/3772356.3772384"},"language":null,"primary_location":{"id":"doi:10.1145/3772356.3772384","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772356.3772384","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3772356.3772384","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th ACM Workshop on Hot Topics in Networks","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3772356.3772384","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120601376","display_name":"Jakob Krebs","orcid":"https://orcid.org/0000-0001-8335-5680"},"institutions":[{"id":"https://openalex.org/I174306211","display_name":"Technion \u2013 Israel Institute of Technology","ror":"https://ror.org/03qryx823","country_code":"IL","type":"education","lineage":["https://openalex.org/I174306211"]}],"countries":["IL"],"is_corresponding":true,"raw_author_name":"Jakob Krebs","raw_affiliation_strings":["Technion - Israel Institute of Technology, Haifa, Israel"],"raw_orcid":"https://orcid.org/0000-0001-8335-5680","affiliations":[{"raw_affiliation_string":"Technion - Israel Institute of Technology, Haifa, Israel","institution_ids":["https://openalex.org/I174306211"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120406557","display_name":"Dimitry Gavrilenko","orcid":"https://orcid.org/0009-0001-6165-3481"},"institutions":[{"id":"https://openalex.org/I174306211","display_name":"Technion \u2013 Israel Institute of Technology","ror":"https://ror.org/03qryx823","country_code":"IL","type":"education","lineage":["https://openalex.org/I174306211"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Dimitry Gavrilenko","raw_affiliation_strings":["Technion - Israel Institute of Technology, Haifa, Israel"],"raw_orcid":"https://orcid.org/0009-0001-6165-3481","affiliations":[{"raw_affiliation_string":"Technion - Israel Institute of Technology, Haifa, Israel","institution_ids":["https://openalex.org/I174306211"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018245644","display_name":"Daniel Amir","orcid":"https://orcid.org/0000-0002-6294-9604"},"institutions":[{"id":"https://openalex.org/I174306211","display_name":"Technion \u2013 Israel Institute of Technology","ror":"https://ror.org/03qryx823","country_code":"IL","type":"education","lineage":["https://openalex.org/I174306211"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Daniel Amir","raw_affiliation_strings":["Technion - Israel Institute of Technology, Haifa, Israel"],"raw_orcid":"https://orcid.org/0000-0002-6294-9604","affiliations":[{"raw_affiliation_string":"Technion - Israel Institute of Technology, Haifa, Israel","institution_ids":["https://openalex.org/I174306211"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078064243","display_name":"Shir Landau Feibish","orcid":"https://orcid.org/0000-0003-3998-8645"},"institutions":[{"id":"https://openalex.org/I91203450","display_name":"University of Haifa","ror":"https://ror.org/02f009v59","country_code":"IL","type":"education","lineage":["https://openalex.org/I91203450"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Shir Landau Feibish","raw_affiliation_strings":["University of Haifa, Haifa, Israel"],"raw_orcid":"https://orcid.org/0000-0003-3998-8645","affiliations":[{"raw_affiliation_string":"University of Haifa, Haifa, Israel","institution_ids":["https://openalex.org/I91203450"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5022593894","display_name":"Mark Silberstein","orcid":"https://orcid.org/0000-0001-9659-068X"},"institutions":[{"id":"https://openalex.org/I174306211","display_name":"Technion \u2013 Israel Institute of Technology","ror":"https://ror.org/03qryx823","country_code":"IL","type":"education","lineage":["https://openalex.org/I174306211"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Mark Silberstein","raw_affiliation_strings":["Technion - Israel Institute of Technology, Haifa, Israel"],"raw_orcid":"https://orcid.org/0000-0001-9659-068X","affiliations":[{"raw_affiliation_string":"Technion - Israel Institute of Technology, Haifa, Israel","institution_ids":["https://openalex.org/I174306211"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5120601376"],"corresponding_institution_ids":["https://openalex.org/I174306211"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.3843921,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"139","last_page":"148"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.19359999895095825,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.19359999895095825,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11005","display_name":"Radiation Effects in Electronics","score":0.1469999998807907,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.11969999969005585,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/traverse","display_name":"Traverse","score":0.829200029373169},{"id":"https://openalex.org/keywords/network-packet","display_name":"Network packet","score":0.5532000064849854},{"id":"https://openalex.org/keywords/routing","display_name":"Routing (electronic design automation)","score":0.5353999733924866},{"id":"https://openalex.org/keywords/flow-network","display_name":"Flow network","score":0.5164999961853027},{"id":"https://openalex.org/keywords/flow","display_name":"Flow (mathematics)","score":0.3961000144481659},{"id":"https://openalex.org/keywords/fault","display_name":"Fault (geology)","score":0.38609999418258667},{"id":"https://openalex.org/keywords/fault-detection-and-isolation","display_name":"Fault detection and isolation","score":0.3248000144958496}],"concepts":[{"id":"https://openalex.org/C176809094","wikidata":"https://www.wikidata.org/wiki/Q15401496","display_name":"Traverse","level":2,"score":0.829200029373169},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6083999872207642},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.5532000064849854},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.5353999733924866},{"id":"https://openalex.org/C114809511","wikidata":"https://www.wikidata.org/wiki/Q1412924","display_name":"Flow network","level":2,"score":0.5164999961853027},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.5029000043869019},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.3961000144481659},{"id":"https://openalex.org/C175551986","wikidata":"https://www.wikidata.org/wiki/Q47089","display_name":"Fault (geology)","level":2,"score":0.38609999418258667},{"id":"https://openalex.org/C152745839","wikidata":"https://www.wikidata.org/wiki/Q5438153","display_name":"Fault detection and isolation","level":3,"score":0.3248000144958496},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3237000107765198},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.3158000111579895},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.31529998779296875},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3009999990463257},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.26660001277923584},{"id":"https://openalex.org/C159631557","wikidata":"https://www.wikidata.org/wiki/Q1546066","display_name":"Networking hardware","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C113508815","wikidata":"https://www.wikidata.org/wiki/Q193446","display_name":"Packet switching","level":3,"score":0.2565000057220459},{"id":"https://openalex.org/C24856439","wikidata":"https://www.wikidata.org/wiki/Q352483","display_name":"Adaptive routing","level":5,"score":0.2540999948978424},{"id":"https://openalex.org/C167391956","wikidata":"https://www.wikidata.org/wiki/Q1401211","display_name":"Fault model","level":3,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3772356.3772384","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772356.3772384","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3772356.3772384","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th ACM Workshop on Hot Topics in Networks","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3772356.3772384","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3772356.3772384","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3772356.3772384","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 24th ACM Workshop on Hot Topics in Networks","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3688088978","display_name":null,"funder_award_id":"980/21","funder_id":"https://openalex.org/F4320322252","funder_display_name":"Israel Science Foundation"},{"id":"https://openalex.org/G6781317376","display_name":null,"funder_award_id":"1998/22","funder_id":"https://openalex.org/F4320322252","funder_display_name":"Israel Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320322252","display_name":"Israel Science Foundation","ror":"https://ror.org/04sazxf24"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416297968.pdf","grobid_xml":"https://content.openalex.org/works/W4416297968.grobid-xml"},"referenced_works_count":6,"referenced_works":["https://openalex.org/W2101471701","https://openalex.org/W2160054705","https://openalex.org/W3097013252","https://openalex.org/W4249923717","https://openalex.org/W4401176521","https://openalex.org/W4401176799"],"related_works":[],"abstract_inverted_index":{"Network":[0],"hardware":[1],"faults":[2,34,42,66],"are":[3,15,43],"inevitable":[4],"in":[5,12,45],"massive":[6],"scale-out":[7],"ML":[8],"training":[9],"clusters.":[10],"Networks":[11],"such":[13,65],"systems":[14],"inherently":[16],"designed":[17],"for":[18],"resiliency,":[19],"routing":[20],"around":[21],"faulty":[22],"components":[23],"as":[24,26],"long":[25],"a":[27,57],"fault":[28],"is":[29],"detected.":[30],"Unfortunately,":[31],"some":[32],"silent":[33,41],"evade":[35],"detection.":[36],"Notably,":[37],"the":[38],"effects":[39],"of":[40,56],"amplified":[44],"modern":[46],"production":[47],"networks":[48],"that":[49],"deploy":[50],"per-packet":[51],"load":[52],"balancing,":[53],"because":[54],"packets":[55],"single":[58],"flow":[59],"traverse":[60],"many":[61],"network":[62],"paths,":[63],"making":[64],"particularly":[67],"hard":[68],"to":[69],"localize.":[70]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-17T00:00:00"}
