{"id":"https://openalex.org/W7161691765","doi":"https://doi.org/10.48550/arxiv.2605.18750","title":"A Readiness-Driven Runtime for Pipeline-Parallel Training under Runtime Variability","display_name":"A Readiness-Driven Runtime for Pipeline-Parallel Training under Runtime Variability","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161691765","doi":"https://doi.org/10.48550/arxiv.2605.18750"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18750","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18750","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18750","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136494271","display_name":"Ruitao Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liu, Ruitao","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136503419","display_name":"Xinyang Tian","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tian, Xinyang","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054950055","display_name":"S Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen, Shuo","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009381590","display_name":"Tingrui Zhang","orcid":"https://orcid.org/0000-0001-5740-4960"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhang, Tingrui","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136461803","display_name":"Guang Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang, Guang","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133047475","display_name":"Alan Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210110438","display_name":"Aoptix Technologies (United States)","ror":"https://ror.org/01xhkgx67","country_code":"US","type":"company","lineage":["https://openalex.org/I4210110438"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhao, Alan","raw_affiliation_strings":["Scitix AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Scitix AI","institution_ids":["https://openalex.org/I4210110438"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5136469037","display_name":"Wei Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu, Wei","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6855000257492065,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6855000257492065,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.05249999836087227,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.04129999876022339,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6650999784469604},{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.6376000046730042},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.6053000092506409},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5785999894142151},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.5702000260353088},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5015000104904175},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4869000017642975},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4738999903202057}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8680999875068665},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6650999784469604},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.6376000046730042},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.6053000092506409},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5785999894142151},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.5702000260353088},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5015000104904175},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4869000017642975},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4738999903202057},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.439300000667572},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4226999878883362},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.39750000834465027},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.36899998784065247},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.3650999963283539},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3303999900817871},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.32249999046325684},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.32010000944137573},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.30239999294281006},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2955999970436096},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C2779019669","wikidata":"https://www.wikidata.org/wiki/Q25203946","display_name":"Asynchrony (computer programming)","level":3,"score":0.26820001006126404},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18750","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18750","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18750","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18750","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pipeline":[0,68],"parallelism":[1],"is":[2,55],"a":[3,70,87,90,103,137],"key":[4],"technique":[5],"for":[6,47,73,107,124,130],"scaling":[7],"large-model":[8],"training,":[9],"but":[10],"modern":[11],"workloads":[12,148,175],"exhibit":[13],"runtime":[14,72],"variability":[15],"in":[16,136],"computation":[17],"and":[18,62,127,141,146,176],"communication.":[19],"Existing":[20],"pipeline":[21,158],"systems":[22],"typically":[23],"consume":[24],"static,":[25],"profiled,":[26],"or":[27],"adaptively":[28],"generated":[29],"schedules":[30,79],"as":[31,89,102],"pre-committed":[32,42],"execution":[33],"orders.":[34],"When":[35],"realized":[36],"task":[37],"readiness":[38],"diverges":[39],"from":[40],"the":[41,100,164,188,193],"order,":[43],"stages":[44,93],"may":[45],"wait":[46,95],"not-yet-ready":[48],"work":[49,54],"even":[50],"though":[51],"other":[52],"executable":[53],"available,":[56],"creating":[57],"stage":[58],"misalignment,":[59],"idle":[60],"bubbles,":[61],"reduced":[63],"utilization.":[64],"We":[65,133],"present":[66],"Runtime-Readiness-First":[67],"(RRFP),":[69],"readiness-driven":[71],"pipeline-parallel":[74],"training.":[75],"RRFP":[76,116,135,154,167,186],"changes":[77],"how":[78],"are":[80],"consumed":[81],"at":[82,149],"runtime:":[83],"instead":[84],"of":[85],"treating":[86],"schedule":[88,101],"sequence":[91],"that":[92],"must":[94],"to":[96,151,170,178,200],"follow,":[97],"it":[98,143],"treats":[99],"non-binding":[104],"hint":[105,191],"order":[106],"ranking":[108],"currently":[109],"ready":[110],"work.":[111],"To":[112],"support":[113],"this":[114],"model,":[115],"combines":[117],"message-driven":[118],"asynchronous":[119],"communication,":[120],"lightweight":[121],"tensor-parallel":[122],"coordination":[123],"collective":[125],"consistency,":[126],"ready-set":[128],"arbitration":[129],"low-overhead":[131],"dispatch.":[132],"implement":[134],"Megatron-based":[138],"training":[139,204],"framework":[140],"evaluate":[142],"on":[144,173,180],"language-only":[145,174],"multimodal":[147,181],"up":[150,169,177,199],"128":[152],"GPUs.":[153],"improves":[155],"over":[156],"fixed-order":[157],"baselines":[159],"across":[160],"all":[161],"settings.":[162],"Using":[163],"BFW":[165],"hint,":[166],"achieves":[168],"1.77$\\times$":[171],"speedup":[172],"2.77$\\times$":[179],"workloads.":[182],"In":[183],"cross-framework":[184],"comparisons,":[185],"with":[187],"default":[189],"BF":[190],"outperforms":[192],"faster":[194],"available":[195],"external":[196],"system":[197],"by":[198],"1.84$\\times$":[201],"while":[202],"preserving":[203],"correctness.":[205]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-20T00:00:00"}
