{"id":"https://openalex.org/W7137805725","doi":"https://doi.org/10.48550/arxiv.2603.14420","title":"Data Darwinism Part II: DataEvolve -- AI can Autonomously Evolve Pretraining Data Curation","display_name":"Data Darwinism Part II: DataEvolve -- AI can Autonomously Evolve Pretraining Data Curation","publication_year":2026,"publication_date":"2026-03-15","ids":{"openalex":"https://openalex.org/W7137805725","doi":"https://doi.org/10.48550/arxiv.2603.14420"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14420","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14420","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14420","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123950109","display_name":"Tiantian Mi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mi, Tiantian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129731260","display_name":"Dongming Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Dongming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129704117","display_name":"Zhen huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Zhen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129751337","display_name":"Yiwei Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Yiwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123983080","display_name":"Muhang Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Muhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129692036","display_name":"Yuxuan Qiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Yuxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129701188","display_name":"Yixiu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yixiu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129729659","display_name":"Chenyang Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Chenyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129697013","display_name":"Pengfei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Pengfei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5123950109"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.20340000092983246,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.20340000092983246,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.10849999636411667,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11937","display_name":"Research Data Management Practices","score":0.08489999920129776,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/raw-data","display_name":"Raw data","score":0.49790000915527344},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.48100000619888306},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4788999855518341},{"id":"https://openalex.org/keywords/iterative-and-incremental-development","display_name":"Iterative and incremental development","score":0.4424000084400177},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4162999987602234},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.40939998626708984},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.3903000056743622}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7664999961853027},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.599399983882904},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5223000049591064},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.49790000915527344},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.48100000619888306},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4788999855518341},{"id":"https://openalex.org/C143587482","wikidata":"https://www.wikidata.org/wiki/Q1543216","display_name":"Iterative and incremental development","level":2,"score":0.4424000084400177},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4162999987602234},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.3903000056743622},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C105902424","wikidata":"https://www.wikidata.org/wiki/Q1197129","display_name":"Evolutionary computation","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3179999887943268},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3059999942779541},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.30469998717308044},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.3019999861717224},{"id":"https://openalex.org/C159149176","wikidata":"https://www.wikidata.org/wiki/Q14489129","display_name":"Evolutionary algorithm","level":2,"score":0.2924000024795532},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.26249998807907104}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14420","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14420","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14420","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14420","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Data":[0],"Darwinism":[1],"(Part":[2],"I)":[3],"established":[4],"a":[5,29,59,72,93,128,147,172],"ten-level":[6],"hierarchy":[7],"for":[8,28,242],"data":[9,18,88,167,244],"processing,":[10],"showing":[11],"that":[12,21,74],"stronger":[13],"processing":[14],"can":[15,62],"unlock":[16],"greater":[17],"value.":[19],"However,":[20],"work":[22],"relied":[23],"on":[24,106,161,187,198],"manually":[25],"designed":[26],"strategies":[27,63,76,151,196,227],"single":[30],"category.":[31,157],"Modern":[32],"pretraining":[33],"corpora":[34],"comprise":[35],"hundreds":[36],"of":[37,124],"heterogeneous":[38],"categories":[39,138],"spanning":[40,139],"domains":[41],"and":[42,111,127,170,182,204,240],"content":[43],"types,":[44],"each":[45,87],"demanding":[46],"specialized":[47],"treatment.":[48],"At":[49],"this":[50],"scale,":[51],"manual":[52,84],"strategy":[53,129,236],"design":[54,237],"becomes":[55],"prohibitive.":[56],"This":[57],"raises":[58],"key":[60],"question:":[61],"evolve":[64,78],"in":[65,92],"an":[66,121],"automated":[67],"way?":[68],"We":[69],"introduce":[70],"DataEvolve,":[71],"framework":[73],"enables":[75],"to":[77,136],"through":[79,120,153],"iterative":[80,222],"optimization":[81],"rather":[82],"than":[83],"design.":[85],"For":[86],"category,":[89],"DataEvolve":[90,144],"operates":[91],"closed":[94],"evolutionary":[95,235],"loop:":[96],"it":[97],"identifies":[98],"quality":[99],"issues,":[100],"generates":[101],"candidate":[102],"strategies,":[103],"executes":[104],"them":[105],"sampled":[107],"data,":[108],"evaluates":[109],"results,":[110],"refines":[112],"approaches":[113],"across":[114,133,176],"generations.":[115],"The":[116],"process":[117],"accumulates":[118],"knowledge":[119],"experience":[122],"pool":[123,130],"discovered":[125],"issues":[126],"tracking":[131],"performance":[132],"iterations.":[134],"Applied":[135],"8":[137],"672B":[140],"tokens":[141],"from":[142,216],"Nemotron-CC,":[143],"produces":[145],"Darwin-CC,":[146],"504B-token":[148],"dataset":[149],"with":[150,184,207],"evolved":[152,195],"30":[154],"iterations":[155],"per":[156],"Training":[158],"3B":[159],"models":[160],"500B":[162],"tokens,":[163],"Darwin-CC":[164],"outperforms":[165],"raw":[166],"(+3.96":[168],"points)":[169],"achieves":[171],"44.13":[173],"average":[174],"score":[175],"18":[177],"benchmarks,":[178],"surpassing":[179],"DCLM,":[180],"Ultra-FineWeb,":[181],"FineWeb-Edu,":[183],"strong":[185],"gains":[186],"knowledge-intensive":[188],"tasks":[189],"such":[190],"as":[191,238],"MMLU.":[192],"Analysis":[193],"shows":[194],"converge":[197],"cleaning-focused":[199],"approaches:":[200],"targeted":[201],"noise":[202],"removal":[203],"format":[205],"normalization":[206],"domain-aware":[208],"preservation,":[209],"echoing":[210],"the":[211],"L4":[212],"(Generative":[213],"Refinement)":[214],"principles":[215],"Part":[217],"I.":[218],"Ablation":[219],"studies":[220],"confirm":[221],"evolution":[223],"is":[224],"essential:":[225],"optimized":[226],"outperform":[228],"suboptimal":[229],"ones":[230],"by":[231],"2.93":[232],"points,":[233],"establishing":[234],"feasible":[239],"necessary":[241],"pretraining-scale":[243],"curation.":[245]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
