{"id":"https://openalex.org/W4415338159","doi":"https://doi.org/10.48550/arxiv.2509.25810","title":"Learning to Reason as Action Abstractions with Scalable Mid-Training RL","display_name":"Learning to Reason as Action Abstractions with Scalable Mid-Training RL","publication_year":2025,"publication_date":"2025-09-30","ids":{"openalex":"https://openalex.org/W4415338159","doi":"https://doi.org/10.48550/arxiv.2509.25810"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.25810","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25810","pdf_url":"https://arxiv.org/pdf/2509.25810","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.25810","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024816616","display_name":"Shenao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Shenao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102073967","display_name":"Donghan Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Donghan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111345651","display_name":"Yihao Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Yihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025877787","display_name":"Bowen Jin","orcid":"https://orcid.org/0000-0003-1295-2829"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Bowen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101934111","display_name":"Zhaoran Wang","orcid":"https://orcid.org/0000-0002-6617-4842"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhaoran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058487388","display_name":"John Peebles","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peebles, John","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5111126191","display_name":"Zirui Wang","orcid":"https://orcid.org/0009-0002-1941-517X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zirui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5024816616"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.930400013923645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.930400013923645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9138000011444092,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9107999801635742,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6345999836921692},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6068000197410583},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.5609999895095825},{"id":"https://openalex.org/keywords/intuition","display_name":"Intuition","score":0.49140000343322754},{"id":"https://openalex.org/keywords/subspace-topology","display_name":"Subspace topology","score":0.4327999949455261},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4196000099182129},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4133000075817108},{"id":"https://openalex.org/keywords/base","display_name":"Base (topology)","score":0.40389999747276306},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.3790999948978424}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7865999937057495},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6345999836921692},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6068000197410583},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.5609999895095825},{"id":"https://openalex.org/C132010649","wikidata":"https://www.wikidata.org/wiki/Q189222","display_name":"Intuition","level":2,"score":0.49140000343322754},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46459999680519104},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.4327999949455261},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4196000099182129},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4133000075817108},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4124999940395355},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.40389999747276306},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3790999948978424},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.36500000953674316},{"id":"https://openalex.org/C106189395","wikidata":"https://www.wikidata.org/wiki/Q176789","display_name":"Markov decision process","level":3,"score":0.35190001130104065},{"id":"https://openalex.org/C166109690","wikidata":"https://www.wikidata.org/wiki/Q4677422","display_name":"Action selection","level":3,"score":0.34630000591278076},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3409999907016754},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.31859999895095825},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30649998784065247},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.30230000615119934},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C28761237","wikidata":"https://www.wikidata.org/wiki/Q7805321","display_name":"Time horizon","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C163239763","wikidata":"https://www.wikidata.org/wiki/Q5153637","display_name":"Common value auction","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2703999876976013},{"id":"https://openalex.org/C176248197","wikidata":"https://www.wikidata.org/wiki/Q458526","display_name":"Probably approximately correct learning","level":4,"score":0.26460000872612}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.25810","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25810","pdf_url":"https://arxiv.org/pdf/2509.25810","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2509.25810","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.25810","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.25810","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25810","pdf_url":"https://arxiv.org/pdf/2509.25810","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4415338159.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"excel":[3],"with":[4],"reinforcement":[5],"learning":[6],"(RL),":[7],"but":[8],"fully":[9],"unlocking":[10],"this":[11,40],"potential":[12],"requires":[13],"a":[14,23,160,167],"mid-training":[15,19,50,81,118,162],"stage.":[16],"An":[17],"effective":[18,121,130],"phase":[20],"should":[21],"identify":[22],"compact":[24,127],"set":[25],"of":[26,80,89,137,142,198],"useful":[27],"actions":[28],"and":[29,67,94,128,172,212,216,223,233,243],"enable":[30],"fast":[31],"selection":[32],"among":[33],"them":[34],"through":[35],"online":[36,112],"RL.":[37],"We":[38],"formalize":[39],"intuition":[41],"by":[42,175,184,214],"presenting":[43],"the":[44,61,68,87,90,102,123,129,135,140,187,196,207,220,224],"first":[45],"theoretical":[46],"result":[47],"on":[48,97,150,186,191,210,239],"how":[49],"shapes":[51,86],"post-training:":[52],"it":[53,174],"characterizes":[54],"an":[55],"action":[56,143],"subspace":[57],"that":[58,106,117],"minimizes":[59],"both":[60],"value":[62],"approximation":[63],"error":[64,70],"from":[65],"pruning":[66,83],"RL":[69,92,98],"during":[71],"subsequent":[72],"planning.":[73],"Our":[74],"analysis":[75],"reveals":[76],"two":[77],"key":[78],"determinants":[79],"effectiveness:":[82],"efficiency,":[84],"which":[85,100,105],"prior":[88],"initial":[91],"policy,":[93],"its":[95],"impact":[96],"convergence,":[99],"governs":[101],"extent":[103],"to":[104],"policy":[107],"can":[108],"be":[109],"improved":[110],"via":[111,181],"interactions.":[113],"These":[114],"results":[115],"suggest":[116],"is":[119,126,132],"most":[120],"when":[122],"decision":[124],"space":[125,141],"horizon":[131],"short,":[133],"highlighting":[134],"importance":[136],"operating":[138],"in":[139,237],"abstractions":[144],"rather":[145],"than":[146],"primitive":[147],"actions.":[148],"Building":[149],"these":[151],"insights,":[152],"we":[153,165],"propose":[154],"Reasoning":[155],"as":[156],"Action":[157],"Abstractions":[158],"(RA3),":[159],"scalable":[161],"algorithm.":[163],"Specifically,":[164],"derive":[166],"sequential":[168],"variational":[169],"lower":[170],"bound":[171],"optimize":[173],"iteratively":[176],"discovering":[177],"temporally-consistent":[178],"latent":[179],"structures":[180],"RL,":[182],"followed":[183],"fine-tuning":[185],"bootstrapped":[188],"data.":[189],"Experiments":[190],"code":[192],"generation":[193],"tasks":[194],"demonstrate":[195],"effectiveness":[197],"our":[199],"approach.":[200],"Across":[201],"multiple":[202],"base":[203,221],"models,":[204],"RA3":[205,229],"improves":[206],"average":[208],"performance":[209,236],"HumanEval":[211],"MBPP":[213],"8":[215],"4":[217],"points":[218],"over":[219],"model":[222],"next-token":[225],"prediction":[226],"baseline.":[227],"Furthermore,":[228],"achieves":[230],"faster":[231],"convergence":[232],"higher":[234],"asymptotic":[235],"RLVR":[238],"HumanEval+,":[240],"MBPP+,":[241],"LiveCodeBench,":[242],"Codeforces.":[244]},"counts_by_year":[],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-19T00:00:00"}
