{"id":"https://openalex.org/W7147601973","doi":"https://doi.org/10.48550/arxiv.2603.27164","title":"daVinci-LLM:Towards the Science of Pretraining","display_name":"daVinci-LLM:Towards the Science of Pretraining","publication_year":2026,"publication_date":"2026-03-28","ids":{"openalex":"https://openalex.org/W7147601973","doi":"https://doi.org/10.48550/arxiv.2603.27164"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.27164","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27164","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.27164","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129751337","display_name":"Yiwei Qin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qin, Yiwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129701188","display_name":"Yixiu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yixiu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123950109","display_name":"Tiantian Mi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mi, Tiantian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123983080","display_name":"Muhang Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Muhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132650101","display_name":"Zhen Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Zhen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124996236","display_name":"Weiye Si","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Si, Weiye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124893805","display_name":"Pengrui Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Pengrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132571457","display_name":"Siyuan Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Siyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113971923","display_name":"Xia Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132664436","display_name":"Liming Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Liming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132724785","display_name":"Ye Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Ye","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129469545","display_name":"Jinlong Hou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Jinlong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132671955","display_name":"Qipeng Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Qipeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132570358","display_name":"Yu Qiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132631814","display_name":"Pengfei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Pengfei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":15,"corresponding_author_ids":["https://openalex.org/A5129751337"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.08309999853372574,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.08309999853372574,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10703","display_name":"Business Process Modeling and Analysis","score":0.06069999933242798,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.05480000004172325,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/openness-to-experience","display_name":"Openness to experience","score":0.5670999884605408},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.5613999962806702},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.5252000093460083},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.49950000643730164},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.37040001153945923},{"id":"https://openalex.org/keywords/dimension","display_name":"Dimension (graph theory)","score":0.36039999127388},{"id":"https://openalex.org/keywords/falsifiability","display_name":"Falsifiability","score":0.358599990606308},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.35100001096725464}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5810999870300293},{"id":"https://openalex.org/C84976871","wikidata":"https://www.wikidata.org/wiki/Q2015673","display_name":"Openness to experience","level":2,"score":0.5670999884605408},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.5613999962806702},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.5252000093460083},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.49950000643730164},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4372999966144562},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.42419999837875366},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.37040001153945923},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.36039999127388},{"id":"https://openalex.org/C116222747","wikidata":"https://www.wikidata.org/wiki/Q220888","display_name":"Falsifiability","level":2,"score":0.358599990606308},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.32919999957084656},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3188999891281128},{"id":"https://openalex.org/C2984917352","wikidata":"https://www.wikidata.org/wiki/Q12772819","display_name":"Scientific discovery","level":2,"score":0.3158000111579895},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30720001459121704},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3034999966621399},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.2842999994754791},{"id":"https://openalex.org/C43540301","wikidata":"https://www.wikidata.org/wiki/Q689971","display_name":"Paradigm shift","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C2778149293","wikidata":"https://www.wikidata.org/wiki/Q309823","display_name":"Open science","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C8795937","wikidata":"https://www.wikidata.org/wiki/Q11862829","display_name":"Discipline","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.26010000705718994},{"id":"https://openalex.org/C2775989810","wikidata":"https://www.wikidata.org/wiki/Q7832978","display_name":"Training manual","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.27164","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27164","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.27164","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27164","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"foundational":[1,138],"pretraining":[2,196],"phase":[3],"determines":[4],"a":[5,27,73,110,120,130,158],"model's":[6],"capability":[7,14],"ceiling,":[8],"as":[9,79,157],"post-training":[10],"struggles":[11],"to":[12,65,116,140,176,208,216],"overcome":[13],"foundations":[15],"established":[16],"during":[17],"pretraining,":[18],"yet":[19],"it":[20,156],"remains":[21],"critically":[22],"under-explored.":[23],"This":[24],"stems":[25],"from":[26,114,123,137,173],"structural":[28],"paradox:":[29],"organizations":[30],"with":[31,61],"computational":[32,51],"resources":[33,60],"operate":[34],"under":[35],"commercial":[36],"pressures":[37],"that":[38,76,95,134],"inhibit":[39],"transparent":[40],"disclosure,":[41],"while":[42,184],"academic":[43],"institutions":[44],"possess":[45],"research":[46,63],"freedom":[47,64],"but":[48],"lack":[49],"pretraining-scale":[50],"resources.":[52],"daVinci-LLM":[53],"occupies":[54],"this":[55],"unexplored":[56],"intersection,":[57],"combining":[58],"industrial-scale":[59],"full":[62,87],"advance":[66],"the":[67,96,106,200,206],"science":[68],"of":[69,195],"pretraining.":[70,222],"We":[71,118],"adopt":[72],"fully-open":[74],"paradigm":[75],"treats":[77],"openness":[78],"scientific":[80,219],"methodology,":[81],"releasing":[82,199],"complete":[83,201],"data":[84,102],"processing":[85,150],"pipelines,":[86],"training":[88],"processes,":[89],"and":[90,213],"systematic":[91,99,214],"exploration":[92,202],"results.":[93],"Recognizing":[94],"field":[97],"lacks":[98],"methodology":[100],"for":[101],"processing,":[103],"we":[104,147,204],"employ":[105],"Data":[107],"Darwinism":[108],"framework,":[109],"principled":[111],"L0-L9":[112],"taxonomy":[113],"filtering":[115],"synthesis.":[117],"train":[119],"3B-parameter":[121],"model":[122],"random":[124],"initialization":[125],"across":[126],"8T":[127],"tokens":[128],"using":[129],"two-stage":[131],"adaptive":[132,171],"curriculum":[133],"progressively":[135],"shifts":[136],"capabilities":[139],"reasoning-intensive":[141],"enhancement.":[142],"Through":[143],"200+":[144],"controlled":[145],"ablations,":[146],"establish":[148],"that:":[149],"depth":[151],"systematically":[152],"enhances":[153],"capabilities,":[154],"establishing":[155],"critical":[159],"dimension":[160],"alongside":[161],"volume":[162],"scaling;":[163],"different":[164],"domains":[165],"exhibit":[166],"distinct":[167],"saturation":[168],"dynamics,":[169],"necessitating":[170],"strategies":[172],"proportion":[174],"adjustments":[175],"format":[177],"shifts;":[178],"compositional":[179],"balance":[180],"enables":[181],"targeted":[182],"intensification":[183],"preventing":[185],"performance":[186],"collapse;":[187],"how":[188],"evaluation":[189],"protocol":[190],"choices":[191],"shape":[192],"our":[193,211],"understanding":[194],"progress.":[197],"By":[198],"process,":[203],"enable":[205],"community":[207],"build":[209],"upon":[210],"findings":[212],"methodologies":[215],"form":[217],"accumulative":[218],"knowledge":[220],"in":[221]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
