{"id":"https://openalex.org/W4415428370","doi":"https://doi.org/10.3233/faia251297","title":"Don\u2019t Stop Pre-Training Small Language Models for Continual Enhancement of Reasoning","display_name":"Don\u2019t Stop Pre-Training Small Language Models for Continual Enhancement of Reasoning","publication_year":2025,"publication_date":"2025-10-21","ids":{"openalex":"https://openalex.org/W4415428370","doi":"https://doi.org/10.3233/faia251297"},"language":null,"primary_location":{"id":"doi:10.3233/faia251297","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia251297","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.3233/faia251297","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100404043","display_name":"Qing Li","orcid":"https://orcid.org/0000-0001-9612-4718"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qing Li","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014759537","display_name":"Qibin Zheng","orcid":"https://orcid.org/0000-0002-3989-377X"},"institutions":[{"id":"https://openalex.org/I4210096250","display_name":"Beijing Institute of Big Data Research","ror":"https://ror.org/00s1sz824","country_code":"CN","type":"facility","lineage":["https://openalex.org/I20231570","https://openalex.org/I37796252","https://openalex.org/I4210096250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qibin Zheng","raw_affiliation_strings":["Advanced Institute of Big Data, Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Advanced Institute of Big Data, Beijing, Beijing, China","institution_ids":["https://openalex.org/I4210096250"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101982037","display_name":"Yi Liu","orcid":"https://orcid.org/0000-0001-7041-694X"},"institutions":[{"id":"https://openalex.org/I4210096250","display_name":"Beijing Institute of Big Data Research","ror":"https://ror.org/00s1sz824","country_code":"CN","type":"facility","lineage":["https://openalex.org/I20231570","https://openalex.org/I37796252","https://openalex.org/I4210096250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Liu","raw_affiliation_strings":["Advanced Institute of Big Data, Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Advanced Institute of Big Data, Beijing, Beijing, China","institution_ids":["https://openalex.org/I4210096250"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5109280499","display_name":"Xingchun Diao","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingchun Diao","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.46875182,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9236000180244446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9236000180244446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9225999712944031,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6248000264167786},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44110000133514404},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.41290000081062317},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.36059999465942383},{"id":"https://openalex.org/keywords/reasoning-system","display_name":"Reasoning system","score":0.33410000801086426},{"id":"https://openalex.org/keywords/language-understanding","display_name":"Language understanding","score":0.3319999873638153},{"id":"https://openalex.org/keywords/non-monotonic-logic","display_name":"Non-monotonic logic","score":0.321399986743927}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7261999845504761},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6248000264167786},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5990999937057495},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4553000032901764},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44110000133514404},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.36059999465942383},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.33410000801086426},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.3319999873638153},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.321399986743927},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C76969082","wikidata":"https://www.wikidata.org/wiki/Q486902","display_name":"Mathematical model","level":2,"score":0.28540000319480896},{"id":"https://openalex.org/C36964233","wikidata":"https://www.wikidata.org/wiki/Q7920942","display_name":"Verbal reasoning","level":3,"score":0.2766000032424927},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.2678999900817871},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.26409998536109924}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/faia251297","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia251297","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"}],"best_oa_location":{"id":"doi:10.3233/faia251297","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia251297","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,83],"investigate":[1],"the":[2,52,71,92,95,157],"continual":[3,63],"enhancement":[4],"of":[5,54,75,94,159],"mathematical":[6,49,72],"reasoning":[7,21,50,73,132,154,169],"abilities":[8],"in":[9,168],"small":[10],"language":[11,16],"models":[12,17,173],"(SLMs).":[13],"While":[14],"large":[15,80],"(LLMs)":[18],"demonstrate":[19,156],"impressive":[20],"performance,":[22,102,170],"their":[23],"deployment":[24],"is":[25],"often":[26],"constrained":[27],"by":[28],"substantial":[29,166],"computational":[30],"costs.":[31],"Existing":[32],"approaches":[33],"to":[34,69],"improving":[35],"SLMs":[36,76,161],"mainly":[37],"rely":[38],"on":[39,79,91,151,174],"knowledge":[40],"distillation":[41],"from":[42],"costly":[43],"teacher":[44,81],"LLMs,":[45],"which":[46],"typically":[47],"improves":[48],"at":[51],"expense":[53],"general":[55,135,180],"capabilities.":[56,181],"In":[57],"this":[58],"work,":[59],"we":[60,103,137],"show":[61],"that":[62,86,116,143],"pre-training":[64],"(CPT)":[65],"has":[66],"strong":[67],"potential":[68],"enhance":[70,130],"ability":[74],"without":[77],"relying":[78],"models.":[82],"also":[84],"find":[85],"its":[87],"effectiveness":[88,158],"critically":[89],"depends":[90],"quality":[93],"training":[96,119],"data.":[97],"To":[98,128],"maximize":[99],"efficiency":[100],"and":[101,125],"propose":[104],"Dual-Metric":[105],"Selection":[106],"for":[107],"Continual":[108],"Pre-training":[109],"(DRIFT),":[110],"a":[111,139],"novel":[112],"data":[113,120,141],"selection":[114],"strategy":[115],"identifies":[117],"optimal":[118],"through":[121],"task-aligned":[122],"loss":[123],"differences":[124],"distributional":[126],"regularization.":[127],"further":[129],"task-specific":[131],"while":[133,177],"preserving":[134,179],"capabilities,":[136],"introduce":[138],"metadata-aware":[140],"mixture":[142],"integrates":[144],"diverse":[145],"sources":[146],"during":[147],"CPT.":[148],"Extensive":[149],"experiments":[150],"multiple":[152],"arithmetic":[153],"benchmarks":[155],"DRIFT:":[160],"trained":[162],"with":[163],"DRIFT":[164],"achieve":[165],"gains":[167],"surpassing":[171],"larger":[172],"specific":[175],"tasks,":[176],"largely":[178]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-24T00:00:00"}
