{"id":"https://openalex.org/W4415338159","doi":"https://doi.org/10.48550/arxiv.2509.25810","title":"Learning to Reason as Action Abstractions with Scalable Mid-Training RL","display_name":"Learning to Reason as Action Abstractions with Scalable Mid-Training RL","publication_year":2025,"publication_date":"2025-09-30","ids":{"openalex":"https://openalex.org/W4415338159","doi":"https://doi.org/10.48550/arxiv.2509.25810"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.25810","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25810","pdf_url":"https://arxiv.org/pdf/2509.25810","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository","raw_type":"text"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.25810","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024816616","display_name":"Shenao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shenao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102073967","display_name":"Donghan Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Donghan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111345651","display_name":"Yihao Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Yihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025877787","display_name":"Bowen Jin","orcid":"https://orcid.org/0000-0003-1295-2829"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Bowen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101934111","display_name":"Zhaoran Wang","orcid":"https://orcid.org/0000-0002-6617-4842"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhaoran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058487388","display_name":"John Peebles","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peebles, John","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5111126191","display_name":"Zirui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zirui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.930400013923645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.930400013923645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9138000011444092,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9107999801635742,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.25810","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25810","pdf_url":"https://arxiv.org/pdf/2509.25810","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository","raw_type":"text"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null},{"id":"doi:10.48550/arxiv.2509.25810","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.25810","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository","raw_type":"article"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.25810","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25810","pdf_url":"https://arxiv.org/pdf/2509.25810","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository","raw_type":"text"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null},"sustainable_development_goals":[],"grants":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"excel":[3],"with":[4],"reinforcement":[5],"learning":[6],"(RL),":[7],"but":[8],"fully":[9],"unlocking":[10],"this":[11,40],"potential":[12],"requires":[13],"a":[14,23,160,167],"mid-training":[15,19,50,81,118,162],"stage.":[16],"An":[17],"effective":[18,121,130],"phase":[20],"should":[21],"identify":[22],"compact":[24,127],"set":[25],"of":[26,80,89,137,142,198],"useful":[27],"actions":[28],"and":[29,67,94,128,172,212,216,223,233,243],"enable":[30],"fast":[31],"selection":[32],"among":[33],"them":[34],"through":[35],"online":[36,112],"RL.":[37],"We":[38],"formalize":[39],"intuition":[41],"by":[42,175,184,214],"presenting":[43],"the":[44,61,68,87,90,102,123,129,135,140,187,196,207,220,224],"first":[45],"theoretical":[46],"result":[47],"on":[48,97,150,186,191,210,239],"how":[49],"shapes":[51,86],"post-training:":[52],"it":[53,174],"characterizes":[54],"an":[55],"action":[56,143],"subspace":[57],"that":[58,106,117],"minimizes":[59],"both":[60],"value":[62],"approximation":[63],"error":[64,70],"from":[65],"pruning":[66,83],"RL":[69,92,98],"during":[71],"subsequent":[72],"planning.":[73],"Our":[74],"analysis":[75],"reveals":[76],"two":[77],"key":[78],"determinants":[79],"effectiveness:":[82],"efficiency,":[84],"which":[85,100,105],"prior":[88],"initial":[91],"policy,":[93],"its":[95],"impact":[96],"convergence,":[99],"governs":[101],"extent":[103],"to":[104],"policy":[107],"can":[108],"be":[109],"improved":[110],"via":[111,181],"interactions.":[113],"These":[114],"results":[115],"suggest":[116],"is":[119,126,132],"most":[120],"when":[122],"decision":[124],"space":[125,141],"horizon":[131],"short,":[133],"highlighting":[134],"importance":[136],"operating":[138],"in":[139,237],"abstractions":[144],"rather":[145],"than":[146],"primitive":[147],"actions.":[148],"Building":[149],"these":[151],"insights,":[152],"we":[153,165],"propose":[154],"Reasoning":[155],"as":[156],"Action":[157],"Abstractions":[158],"(RA3),":[159],"scalable":[161],"algorithm.":[163],"Specifically,":[164],"derive":[166],"sequential":[168],"variational":[169],"lower":[170],"bound":[171],"optimize":[173],"iteratively":[176],"discovering":[177],"temporally-consistent":[178],"latent":[179],"structures":[180],"RL,":[182],"followed":[183],"fine-tuning":[185],"bootstrapped":[188],"data.":[189],"Experiments":[190],"code":[192],"generation":[193],"tasks":[194],"demonstrate":[195],"effectiveness":[197],"our":[199],"approach.":[200],"Across":[201],"multiple":[202],"base":[203,221],"models,":[204],"RA3":[205,229],"improves":[206],"average":[208],"performance":[209,236],"HumanEval":[211],"MBPP":[213],"8":[215],"4":[217],"points":[218],"over":[219],"model":[222],"next-token":[225],"prediction":[226],"baseline.":[227],"Furthermore,":[228],"achieves":[230],"faster":[231],"convergence":[232],"higher":[234],"asymptotic":[235],"RLVR":[238],"HumanEval+,":[240],"MBPP+,":[241],"LiveCodeBench,":[242],"Codeforces.":[244]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-19T00:00:00"}