{"id":"https://openalex.org/W4412888460","doi":"https://doi.org/10.18653/v1/2025.findings-acl.397","title":"Towards A Better Initial Policy Model For Scalable Long-CoT Reinforcement Learning","display_name":"Towards A Better Initial Policy Model For Scalable Long-CoT Reinforcement Learning","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412888460","doi":"https://doi.org/10.18653/v1/2025.findings-acl.397"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.findings-acl.397","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.397","pdf_url":"https://aclanthology.org/2025.findings-acl.397.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-acl.397.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002310167","display_name":"Bofei Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bofei Gao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042432324","display_name":"Yejie Wang","orcid":"https://orcid.org/0009-0000-7361-1679"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yejie Wang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045257498","display_name":"Yibo Miao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yibo Miao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079129705","display_name":"Ruoyu Wu","orcid":"https://orcid.org/0000-0003-2649-6086"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ruoyu Wu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101633968","display_name":"Fei Song","orcid":"https://orcid.org/0000-0002-8825-6494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feifan Song","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049419584","display_name":"Longhui Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Longhui Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102771621","display_name":"Tianyu Liu","orcid":"https://orcid.org/0000-0002-9098-5944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianyu Liu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5021459300","display_name":"Baobao Chang","orcid":"https://orcid.org/0000-0003-2824-6750"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baobao Chang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18894987,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"7652","last_page":"7665"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13553","display_name":"Age of Information Optimization","score":0.6992999911308289,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13553","display_name":"Age of Information Optimization","score":0.6992999911308289,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.8610330820083618},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6952351927757263},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6097565293312073},{"id":"https://openalex.org/keywords/reinforcement","display_name":"Reinforcement","score":0.48146024346351624},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.34412527084350586},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.11636000871658325},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.09639790654182434},{"id":"https://openalex.org/keywords/structural-engineering","display_name":"Structural engineering","score":0.06226947903633118}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.8610330820083618},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6952351927757263},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6097565293312073},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.48146024346351624},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34412527084350586},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.11636000871658325},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.09639790654182434},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.06226947903633118}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.findings-acl.397","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.397","pdf_url":"https://aclanthology.org/2025.findings-acl.397.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-acl.397","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-acl.397","pdf_url":"https://aclanthology.org/2025.findings-acl.397.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: ACL 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G138806403","display_name":null,"funder_award_id":"61876004","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G655826466","display_name":"\u57fa\u4e8e\u8bed\u8a00\u8ba4\u77e5\u673a\u7406\u7684\u6c49\u8bed\u6846\u67b6\u8bed\u4e49\u8ba1\u7b97\u7814\u7a76","funder_award_id":"61936012","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412888460.pdf","grobid_xml":"https://content.openalex.org/works/W4412888460.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4310083477","https://openalex.org/W2328553770","https://openalex.org/W2920061524","https://openalex.org/W1977959518","https://openalex.org/W2038908348","https://openalex.org/W2107890255","https://openalex.org/W2106552856"],"abstract_inverted_index":{"Long-CoT":[0],"reasoning":[1,71],"combined":[2],"with":[3,66],"reinforcement":[4],"learning":[5],"for":[6,41,84,122],"large":[7],"language":[8],"models":[9,87],"demonstrates":[10],"remarkable":[11],"performance":[12,27],"and":[13,70,78,95,102,117,148],"scalability.However,":[14],"we":[15,51,73,112],"observe":[16],"that":[17],"the":[18,25,31,58,81,89,105,133,137],"initial":[19,45,59,86],"policy":[20,46],"model":[21,60,135],"could":[22],"significantly":[23],"influence":[24],"final":[26],"as":[28,30],"well":[29],"token":[32,96],"efficiency.Additionally,":[33],"there":[34],"is":[35],"a":[36,43,53,62,75,99,114,118,124],"lack":[37],"of":[38,64,80,91,142],"systematic":[39,115],"guidelines":[40],"obtaining":[42],"better":[44,125],"model.To":[47],"bridge":[48],"this":[49],"gap,":[50],"initiate":[52],"comprehensive":[54],"investigation":[55],"by":[56,139],"activating":[57],"using":[61],"variety":[63],"datasets":[65],"different":[67,85],"data":[68],"volumes":[69],"patterns.Then,":[72],"conduct":[74],"thorough":[76],"analysis":[77],"comparison":[79],"RL":[82,126],"process":[83],"from":[88],"perspectives":[90],"upper":[92],"bounds,":[93],"diversity,":[94],"efficiency,":[97],"providing":[98],"deeper":[100],"understanding":[101],"insight":[103],"into":[104],"long-CoT":[106],"RL.Based":[107],"on":[108,132],"our":[109,145],"empirical":[110],"results,":[111],"propose":[113],"guideline":[116],"novel":[119],"Re-RFT":[120],"method":[121],"constructing":[123],"start":[127],"point.Our":[128],"experiment":[129],"results":[130],"based":[131],"14B":[134],"surpass":[136],"DeepSeek-R1-Distill-Qwen-14B":[138],"an":[140],"average":[141],"4.6%,":[143],"demonstrating":[144],"approach's":[146],"effectiveness":[147],"superiority.":[149]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
