{"id":"https://openalex.org/W7123842876","doi":"https://doi.org/10.48550/arxiv.2601.06103","title":"The Impact of Post-training on Data Contamination","display_name":"The Impact of Post-training on Data Contamination","publication_year":2026,"publication_date":"2026-01-03","ids":{"openalex":"https://openalex.org/W7123842876","doi":"https://doi.org/10.48550/arxiv.2601.06103"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.06103","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.06103","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.06103","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027139032","display_name":"Muhammed Yusuf Kocyigit","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kocyigit, Muhammed Yusuf","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5122925254","display_name":"Caglar Yildirim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yildirim, Caglar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5027139032"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.18870000541210175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.18870000541210175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.13249999284744263,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.1307000070810318,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/contamination","display_name":"Contamination","score":0.7616000175476074},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5924000144004822},{"id":"https://openalex.org/keywords/memorization","display_name":"Memorization","score":0.46160000562667847},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.4099000096321106},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.3409999907016754},{"id":"https://openalex.org/keywords/leakage","display_name":"Leakage (economics)","score":0.3091000020503998}],"concepts":[{"id":"https://openalex.org/C112570922","wikidata":"https://www.wikidata.org/wiki/Q60528603","display_name":"Contamination","level":2,"score":0.7616000175476074},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5924000144004822},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5139999985694885},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.46160000562667847},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.4099000096321106},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3409999907016754},{"id":"https://openalex.org/C39432304","wikidata":"https://www.wikidata.org/wiki/Q188847","display_name":"Environmental science","level":0,"score":0.32100000977516174},{"id":"https://openalex.org/C2777042071","wikidata":"https://www.wikidata.org/wiki/Q6509304","display_name":"Leakage (economics)","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.30230000615119934},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.28119999170303345},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27300000190734863},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25780001282691956},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.06103","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.06103","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.06103","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.06103","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,55],"present":[1],"a":[2],"controlled":[3],"study":[4],"of":[5,26,36,47,126],"how":[6],"dataset":[7],"contamination":[8,127,194],"interacts":[9],"with":[10,81,115,143],"the":[11,43,58,122,139,152,191],"post-training":[12,72,89,197],"stages":[13],"now":[14],"standard":[15],"in":[16],"large":[17],"language":[18],"model":[19],"training":[20],"pipelines.":[21],"Starting":[22],"from":[23],"clean":[24,61],"checkpoints":[25],"Qwen2.5":[27],"(0.5B/1.5B)":[28],"and":[29,38,60,67,77,98,136,198],"Gemma3":[30],"(1B/4B),":[31],"we":[32,101],"inject":[33],"five":[34],"copies":[35],"GSM8K":[37],"MBPP":[39],"test":[40],"items":[41],"into":[42,184],"first":[44],"2B":[45],"tokens":[46,121],"an":[48],"otherwise":[49],"25B":[50,120],"token":[51],"extended":[52],"pre-training":[53,66],"dataset.":[54],"then":[56],"compare":[57],"contaminated":[59,153],"models":[62,175,181],"both":[63],"immediately":[64],"after":[65,69],"again":[68],"two":[70],"popular":[71],"methods:":[73],"supervised":[74],"fine-tuning":[75],"(SFT)":[76],"reinforcement":[78],"learning":[79],"(RL)":[80],"group":[82],"relative":[83],"policy":[84],"optimization":[85],"(GRPO).":[86],"The":[87],"applied":[88],"steps":[90],"do":[91],"not":[92,204],"have":[93],"any":[94],"contamination.":[95],"Across":[96],"math":[97],"coding":[99],"benchmarks,":[100],"find":[102],"three":[103],"consistent":[104],"patterns:":[105],"(i)":[106],"Contamination":[107],"causes":[108],"performance":[109,124,159],"spikes":[110],"that":[111,200],"are":[112],"gradually":[113],"diminished":[114],"continued":[116],"pre-training.":[117],"After":[118],"even":[119],"apparent":[123],"inflation":[125],"can":[128,206],"become":[129],"close":[130],"to":[131],"zero.":[132],"(ii)":[133],"Both":[134],"SFT":[135,147],"GRPO":[137,156,180],"resurface":[138],"leaked":[140],"information,":[141],"but":[142],"different":[144],"external":[145],"validity:":[146],"inflates":[148,158],"scores":[149],"only":[150],"on":[151,160],"tasks,":[154],"whereas":[155],"also":[157],"uncontaminated":[161],"counterparts":[162],"(GSMPlus,":[163],"HumanEval).":[164],"(iii)":[165],"Model":[166],"scale":[167],"amplifies":[168],"these":[169],"tendencies,":[170],"larger":[171,179],"Supervised":[172],"Fine":[173],"Tuned":[174],"memorize":[176],"more,":[177],"while":[178],"translate":[182],"leakage":[183],"more":[185],"generalizable":[186],"capabilities.":[187],"Our":[188],"results":[189],"underscore":[190],"need":[192],"for":[193],"audits":[195],"\\emph{after}":[196],"suggest":[199],"RL-based":[201],"post-training,":[202],"although":[203],"immune,":[205],"help":[207],"alleviate":[208],"contamination-related":[209],"over-estimation":[210],"problems.":[211]},"counts_by_year":[],"updated_date":"2026-01-14T23:44:37.837170","created_date":"2026-01-14T00:00:00"}
