{"id":"https://openalex.org/W4408029480","doi":"https://doi.org/10.1145/3710848.3710869","title":"WeiPipe: Weight Pipeline Parallelism for Communication-Effective Long-Context Large Model Training","display_name":"WeiPipe: Weight Pipeline Parallelism for Communication-Effective Long-Context Large Model Training","publication_year":2025,"publication_date":"2025-02-28","ids":{"openalex":"https://openalex.org/W4408029480","doi":"https://doi.org/10.1145/3710848.3710869"},"language":"en","primary_location":{"id":"doi:10.1145/3710848.3710869","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3710848.3710869","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3710848.3710869","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3710848.3710869","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101982106","display_name":"Junfeng Lin","orcid":"https://orcid.org/0009-0008-5214-9858"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junfeng Lin","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101924544","display_name":"Ziming Liu","orcid":"https://orcid.org/0009-0009-3355-6770"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ziming Liu","raw_affiliation_strings":["National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100658705","display_name":"Yang You","orcid":"https://orcid.org/0000-0003-2816-4384"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yang You","raw_affiliation_strings":["National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100384886","display_name":"Jun Wang","orcid":"https://orcid.org/0000-0003-2177-9757"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jun Wang","raw_affiliation_strings":["CETHIK Group Co. Ltd., Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"CETHIK Group Co. Ltd., Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101476783","display_name":"Weihao Zhang","orcid":"https://orcid.org/0000-0002-9301-8538"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weihao Zhang","raw_affiliation_strings":["Lynxi Technologies Co. Ltd, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Lynxi Technologies Co. Ltd, Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100675943","display_name":"Rong Zhao","orcid":"https://orcid.org/0000-0002-2320-0326"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rong Zhao","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101982106"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":3.9571,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.92716902,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"225","last_page":"238"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12814","display_name":"Gaussian Processes and Bayesian Inference","score":0.9825999736785889,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8228722810745239},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.7479713559150696},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7458735704421997},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5987707376480103},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.57991623878479},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5691182613372803},{"id":"https://openalex.org/keywords/pipeline-transport","display_name":"Pipeline transport","score":0.42270126938819885},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.4079514741897583},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3310931324958801},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.17139384150505066},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06940081715583801}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8228722810745239},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.7479713559150696},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7458735704421997},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5987707376480103},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.57991623878479},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5691182613372803},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.42270126938819885},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4079514741897583},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3310931324958801},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.17139384150505066},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06940081715583801},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C87717796","wikidata":"https://www.wikidata.org/wiki/Q146326","display_name":"Environmental engineering","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3710848.3710869","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3710848.3710869","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3710848.3710869","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3710848.3710869","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3710848.3710869","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3710848.3710869","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4408029480.pdf"},"referenced_works_count":12,"referenced_works":["https://openalex.org/W2612387305","https://openalex.org/W2991040477","https://openalex.org/W3129831491","https://openalex.org/W3132977829","https://openalex.org/W3177828909","https://openalex.org/W3206832494","https://openalex.org/W4283830961","https://openalex.org/W4294433898","https://openalex.org/W4295312788","https://openalex.org/W4386348101","https://openalex.org/W4386768656","https://openalex.org/W4390872297"],"related_works":["https://openalex.org/W4380433113","https://openalex.org/W4386072068","https://openalex.org/W252339960","https://openalex.org/W2390529043","https://openalex.org/W2378320433","https://openalex.org/W2358343511","https://openalex.org/W2071821326","https://openalex.org/W2051877971","https://openalex.org/W1970117064","https://openalex.org/W1787170397"],"abstract_inverted_index":{"Training":[0],"large":[1],"language":[2],"models":[3],"(LLMs)":[4],"has":[5],"become":[6,32],"increasingly":[7],"expensive":[8],"due":[9,46],"to":[10,47,72,139,159,199],"the":[11,48,126],"rapid":[12],"expansion":[13],"in":[14,95,144,156,195],"model":[15,146],"size.":[16],"Pipeline":[17],"parallelism":[18,64,162,167],"is":[19],"a":[20,73,82,96,153],"widely":[21],"used":[22],"distributed":[23],"training":[24],"technique.":[25],"However,":[26],"as":[27],"LLMs":[28],"with":[29,168,177],"larger":[30],"context":[31],"prevalent":[33],"and":[34,53,80,90,123,142,163,181,185],"memory":[35],"optimization":[36],"techniques":[37],"advance,":[38],"traditional":[39],"PP":[40],"methods":[41],"encounter":[42],"greater":[43,193],"communication":[44,78,105,121],"challenges":[45],"increased":[49],"size":[50],"of":[51,55,114,134],"activations":[52],"gradients":[54,92],"activations.":[56],"To":[57],"address":[58],"this":[59],"issue,":[60],"we":[61],"introduce":[62],"weight-pipeline":[63],"(WeiPipe)":[65],"that":[66],"transitions":[67],"from":[68],"an":[69],"activation-passing":[70],"pipeline":[71,97,161],"weight-passing":[74],"pipeline.":[75],"WeiPipe":[76,99,115,190],"reduces":[77],"costs":[79],"achieves":[81],"more":[83],"balanced":[84],"utilization":[85],"by":[86],"transmitting":[87],"only":[88],"weights":[89],"their":[91],"between":[93],"workers":[94],"manner.":[98],"does":[100],"not":[101],"rely":[102],"on":[103,137],"collective":[104],"primitives,":[106],"thus":[107],"ensuring":[108],"scalability.":[109],"We":[110],"present":[111],"four":[112],"variations":[113],"parallelism,":[116],"including":[117,148,172],"WeiPipe-Interleave,":[118,135],"which":[119],"emphasizes":[120],"efficiency,":[122],"WeiPipe-zero-bubble,":[124],"discussing":[125],"potential":[127],"for":[128],"minimal":[129],"bubble":[130],"ratios.":[131],"Our":[132],"implementation":[133],"performed":[136],"up":[138],"32":[140],"GPUs":[141],"tested":[143],"various":[145],"configurations,":[147],"large-context":[149],"LLM":[150],"training,":[151],"demonstrates":[152],"significant":[154],"improvement":[155],"throughput":[157],"compared":[158,198],"state-of-the-art":[160],"fully":[164],"sharded":[165],"data":[166],"different":[169],"underlying":[170],"infrastructures,":[171],"NVLink":[173],"connections":[174],"within":[175,183],"cluster":[176,184],"Ethernet":[178,186],"among":[179,187],"cluster,":[180],"PCIe":[182],"cluster.":[188],"Additionally,":[189],"also":[191],"shows":[192],"scalability":[194],"communication-constrained":[196],"scenarios":[197],"state-of-art":[200],"strategies.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":1}],"updated_date":"2026-02-25T08:12:03.925757","created_date":"2025-10-10T00:00:00"}
