{"id":"https://openalex.org/W4407130444","doi":"https://doi.org/10.1109/icnp61940.2024.10858546","title":"Kspeed: Beating I/O Bottlenecks of Data Provisioning for RDMA Training Clusters","display_name":"Kspeed: Beating I/O Bottlenecks of Data Provisioning for RDMA Training Clusters","publication_year":2024,"publication_date":"2024-10-28","ids":{"openalex":"https://openalex.org/W4407130444","doi":"https://doi.org/10.1109/icnp61940.2024.10858546"},"language":"en","primary_location":{"id":"doi:10.1109/icnp61940.2024.10858546","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp61940.2024.10858546","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 32nd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100743206","display_name":"Jianbo Dong","orcid":"https://orcid.org/0000-0003-0939-8943"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jianbo Dong","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091038854","display_name":"Hao Qi","orcid":"https://orcid.org/0009-0007-8795-5262"},"institutions":[{"id":"https://openalex.org/I156087764","display_name":"University of California, Merced","ror":"https://ror.org/00d9ah105","country_code":"US","type":"education","lineage":["https://openalex.org/I156087764"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hao Qi","raw_affiliation_strings":["University of California, Merced,Department of Computer Science,Merced,CA,USA"],"affiliations":[{"raw_affiliation_string":"University of California, Merced,Department of Computer Science,Merced,CA,USA","institution_ids":["https://openalex.org/I156087764"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088109976","display_name":"Tianjing Xu","orcid":"https://orcid.org/0000-0002-7293-430X"},"institutions":[{"id":"https://openalex.org/I874390226","display_name":"China Construction Bank","ror":"https://ror.org/00xkrw816","country_code":"CN","type":"other","lineage":["https://openalex.org/I874390226"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianjing Xu","raw_affiliation_strings":["China Construction Bank Operations Data Center,Beijing,China"],"affiliations":[{"raw_affiliation_string":"China Construction Bank Operations Data Center,Beijing,China","institution_ids":["https://openalex.org/I874390226"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100386581","display_name":"Xiaoli Liu","orcid":"https://orcid.org/0000-0002-2585-7559"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoli Liu","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100344294","display_name":"Wei Chen","orcid":"https://orcid.org/0000-0001-5090-9915"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen Wei","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067475933","display_name":"Ruini Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rongyao Wang","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067748041","display_name":"Xiaoyi Lu","orcid":"https://orcid.org/0000-0001-7581-8905"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyi Lu","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029024939","display_name":"Zheng Cao","orcid":"https://orcid.org/0000-0002-1565-3683"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Cao","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5006224078","display_name":"Binzhang Fu","orcid":"https://orcid.org/0009-0008-1213-0554"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Binzhang Fu","raw_affiliation_strings":["Alibaba Group,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100743206"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.3653,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.66474227,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"12"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.6653000116348267,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.6653000116348267,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.5672000050544739,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.937976062297821},{"id":"https://openalex.org/keywords/provisioning","display_name":"Provisioning","score":0.8177676796913147},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7026711702346802},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5102698802947998},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.4199366867542267},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3917772173881531},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.32402127981185913},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.07634249329566956}],"concepts":[{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.937976062297821},{"id":"https://openalex.org/C172191483","wikidata":"https://www.wikidata.org/wiki/Q1071806","display_name":"Provisioning","level":2,"score":0.8177676796913147},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7026711702346802},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5102698802947998},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.4199366867542267},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3917772173881531},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.32402127981185913},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.07634249329566956},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icnp61940.2024.10858546","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp61940.2024.10858546","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 32nd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2072566913","https://openalex.org/W2083842231","https://openalex.org/W2194775991","https://openalex.org/W2464708700","https://openalex.org/W2604319603","https://openalex.org/W2886189612","https://openalex.org/W2900042580","https://openalex.org/W2963426391","https://openalex.org/W3016395792","https://openalex.org/W3047537431","https://openalex.org/W3093933627","https://openalex.org/W3205803342","https://openalex.org/W3206418153","https://openalex.org/W4281657584","https://openalex.org/W4285503860","https://openalex.org/W4288083516","https://openalex.org/W4312080192","https://openalex.org/W6634769544","https://openalex.org/W6710076213","https://openalex.org/W6713134421","https://openalex.org/W6738811965","https://openalex.org/W6753584990","https://openalex.org/W6766978945","https://openalex.org/W6774103200","https://openalex.org/W6780671031","https://openalex.org/W6784425352","https://openalex.org/W6996886134"],"related_works":["https://openalex.org/W3090586438","https://openalex.org/W2782433361","https://openalex.org/W2416075414","https://openalex.org/W2475302168","https://openalex.org/W4392747727","https://openalex.org/W4366999533","https://openalex.org/W2134569538","https://openalex.org/W3135214639","https://openalex.org/W2594055038","https://openalex.org/W2754465584"],"abstract_inverted_index":{"The":[0],"rapidly-increasing":[1],"computing":[2],"power":[3],"of":[4,36,49,60],"GPUs":[5,64,80],"has":[6,26],"rendered":[7],"the":[8,47,57,82,148,157,172,187,198,231,242],"I/O":[9],"subsystem":[10],"a":[11,33,119,152,212],"bottleneck":[12],"for":[13,32,124],"distributed":[14],"deep":[15],"learning":[16],"(DL)":[17],"training.":[18],"Currently,":[19],"substantial":[20],"data":[21,62,109,121,131,163,173,182,227],"preprocessing":[22,132],"work":[23],"(e.g.,":[24],"decoding)":[25],"to":[27,63,87,91,135,150,167,186,192,202,248],"be":[28,136],"conducted":[29],"on":[30,56,211],"CPUs":[31,55,75,176],"wide":[34],"range":[35],"training":[37,50,72,127,188],"scenarios":[38],"such":[39],"as":[40,241],"computer":[41],"vision":[42],"(CV)":[43],"and":[44,81,103,113,144,179,208,235],"audio.":[45],"Unfortunately,":[46],"involvement":[48],"nodes'":[51],"host":[52,88,142,169],"memory":[53,89,143,191,205],"and/or":[54],"critical":[58],"path":[59],"loading":[61,110,228],"incurs":[65,99],"significant":[66],"GPU":[67,190,194,243],"stalls":[68],"in":[69,147],"modern":[70],"RDMA":[71,126,200],"clusters,":[73],"because":[74],"are":[76],"much":[77],"slower":[78],"than":[79],"connection":[83],"from":[84,93,164,246],"PCIe":[85],"switches":[86],"tends":[90],"suffer":[92],"incast":[94],"problems.":[95],"Moreover,":[96],"this":[97],"also":[98],"high":[100],"CPU":[101,145],"usage":[102],"resource":[104],"contention,":[105],"which":[106],"consequently":[107],"causes":[108],"performance":[111,229],"variation":[112],"stragglers.":[114],"This":[115],"paper":[116],"presents":[117],"KSpeed,":[118],"novel":[120],"provisioning":[122],"framework":[123],"large-scale":[125],"clusters.":[128],"As":[129],"many":[130],"tasks":[133],"need":[134],"done":[137],"by":[138,174],"CPUs,":[139],"KSpeed":[140,196,217,237],"organizes":[141],"resources":[146],"cluster":[149,214],"build":[151],"disaggregated":[153],"memory/CPU":[154],"pool,":[155],"where":[156],"nodes":[158],"can":[159],"read":[160],"raw":[161],"input":[162],"backend":[165],"storage":[166],"their":[168,175],"memory,":[170],"preprocess":[171],"if":[177],"necessary,":[178],"write":[180],"cached/preprocessed":[181],"(on":[183],"demand)":[184],"directly":[185],"workers'":[189],"minimize":[193],"stalls.":[195],"leverages":[197],"multi-rail":[199],"network":[201],"eliminate":[203],"unnecessary":[204],"copies,":[206],"interference,":[207],"congestion.":[209],"Evaluation":[210],"96-GPU":[213],"shows":[215],"that":[216],"delivers":[218],"<tex":[219],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[220],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$5.4":[221],"\\times":[222],"\\sim":[223],"100":[224],"\\times$</tex>":[225],"higher":[226],"over":[230],"state-of-the-art":[232],"designs":[233],"(DPP":[234],"Alluxio).":[236],"achieves":[238],"near-linear":[239],"scalability":[240],"number":[244],"increases":[245],"8":[247],"512.":[249]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-19T19:40:27.379048","created_date":"2025-10-10T00:00:00"}
