{"id":"https://openalex.org/W4415288811","doi":"https://doi.org/10.48550/arxiv.2503.16219","title":"Reinforcement Learning for Reasoning in Small LLMs: What Works and What Doesn't","display_name":"Reinforcement Learning for Reasoning in Small LLMs: What Works and What Doesn't","publication_year":2025,"publication_date":"2025-03-20","ids":{"openalex":"https://openalex.org/W4415288811","doi":"https://doi.org/10.48550/arxiv.2503.16219"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2503.16219","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.16219","pdf_url":"https://arxiv.org/pdf/2503.16219","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.16219","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120038257","display_name":"Quy-Anh Dang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dang, Quy-Anh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Ngo, Chris","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ngo, Chris","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10551","display_name":"Scheduling and Optimization Algorithms","score":0.6176999807357788,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10551","display_name":"Scheduling and Optimization Algorithms","score":0.6176999807357788,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11182","display_name":"Auction Theory and Applications","score":0.5436000227928162,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.73089998960495},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.6317999958992004},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.5931000113487244},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.42179998755455017},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.40119999647140503},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.37619999051094055},{"id":"https://openalex.org/keywords/case-based-reasoning","display_name":"Case-based reasoning","score":0.34119999408721924}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.73089998960495},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6452999711036682},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.6317999958992004},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.5931000113487244},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5364000201225281},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.42179998755455017},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.413100004196167},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.40119999647140503},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.37619999051094055},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.34119999408721924},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C3018790387","wikidata":"https://www.wikidata.org/wiki/Q869010","display_name":"Hybrid learning","level":2,"score":0.3151000142097473},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.2572000026702881},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.2556000053882599}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2503.16219","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.16219","pdf_url":"https://arxiv.org/pdf/2503.16219","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.16219","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.16219","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.16219","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.16219","pdf_url":"https://arxiv.org/pdf/2503.16219","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Enhancing":[0],"the":[1,26,61,141],"reasoning":[2,34,74,90],"capabilities":[3],"of":[4,28,120,143],"large":[5],"language":[6],"models":[7],"(LLMs)":[8],"typically":[9],"relies":[10],"on":[11,39,48],"massive":[12],"computational":[13],"resources":[14],"and":[15,68,84,101,112,131,160,169],"extensive":[16],"datasets,":[17],"limiting":[18],"accessibility":[19],"for":[20,122,146,173],"resource-constrained":[21],"settings.":[22],"Our":[23,86],"study":[24],"investigates":[25],"potential":[27],"reinforcement":[29],"learning":[30],"(RL)":[31],"to":[32,80,99,118,153],"improve":[33],"in":[35,177],"small":[36,147],"LLMs,":[37,148],"focusing":[38],"a":[40,70,113,150,171],"1.5-billion-parameter":[41],"model,":[42],"DeepSeek-R1-Distill-Qwen-1.5B,":[43],"under":[44],"strict":[45],"constraints:":[46],"training":[47,115],"4":[49],"NVIDIA":[50],"A40":[51],"GPUs":[52],"(48":[53],"GB":[54],"VRAM":[55],"each)":[56],"within":[57],"24":[58],"hours.":[59],"Adapting":[60],"Group":[62],"Relative":[63],"Policy":[64],"Optimization":[65],"(GRPO)":[66],"algorithm":[67],"curating":[69],"compact,":[71],"high-quality":[72],"mathematical":[73],"dataset,":[75],"we":[76],"conducted":[77],"three":[78],"experiments":[79],"explore":[81],"model":[82],"behavior":[83],"performance.":[85],"results":[87],"demonstrate":[88],"rapid":[89],"gains":[91],"-":[92,107],"e.g.,":[93],"AMC23":[94],"accuracy":[95],"rising":[96],"from":[97],"63%":[98],"80%":[100],"AIME24":[102],"reaching":[103],"46.7%,":[104],"surpassing":[105],"o1-preview":[106],"using":[108],"only":[109],"7,000":[110],"samples":[111],"$42":[114],"cost,":[116],"compared":[117],"thousands":[119],"dollars":[121],"baseline":[123],"models.":[124],"However,":[125],"challenges":[126],"such":[127],"as":[128,162],"optimization":[129],"instability":[130],"length":[132],"constraints":[133],"emerged":[134],"with":[135],"prolonged":[136],"training.":[137],"These":[138],"findings":[139],"highlight":[140],"efficacy":[142],"RL-based":[144],"fine-tuning":[145],"offering":[149],"cost-effective":[151],"alternative":[152],"large-scale":[154],"approaches.":[155],"We":[156],"release":[157],"our":[158],"code":[159],"datasets":[161],"open-source":[163],"resources,":[164],"providing":[165],"insights":[166],"into":[167],"trade-offs":[168],"laying":[170],"foundation":[172],"scalable,":[174],"reasoning-capable":[175],"LLMs":[176],"resource-limited":[178],"environments.":[179],"All":[180],"are":[181],"available":[182],"at":[183],"https://github.com/knoveleng/open-rs.":[184]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-17T00:00:00"}
