{"id":"https://openalex.org/W7157897572","doi":"https://doi.org/10.48550/arxiv.2604.24819","title":"Programming with Data: Test-Driven Data Engineering for Self-Improving LLMs from Raw Corpora","display_name":"Programming with Data: Test-Driven Data Engineering for Self-Improving LLMs from Raw Corpora","publication_year":2026,"publication_date":"2026-04-27","ids":{"openalex":"https://openalex.org/W7157897572","doi":"https://doi.org/10.48550/arxiv.2604.24819"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.24819","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24819","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.24819","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134201770","display_name":"Chenkai Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Pan, Chenkai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134826654","display_name":"Xinglong Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xinglong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134844820","display_name":"Yuhang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085208430","display_name":"Yujun Wu","orcid":"https://orcid.org/0000-0001-9250-5046"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yujun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134871631","display_name":"Siyuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Siyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134835833","display_name":"Jintao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jintao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134852661","display_name":"Conghui He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Conghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134838640","display_name":"Jingxuan Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Jingxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134845881","display_name":"Cheng Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5134201770"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3792000114917755,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.3792000114917755,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08659999817609787,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.0560000017285347,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5498999953269958},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.5339999794960022},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.49079999327659607},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.4896000027656555},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.48420000076293945},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.47620001435279846},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.43619999289512634},{"id":"https://openalex.org/keywords/raw-data","display_name":"Raw data","score":0.4066999852657318},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.39730000495910645},{"id":"https://openalex.org/keywords/domain-model","display_name":"Domain model","score":0.3756999969482422}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.727400004863739},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5498999953269958},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.5339999794960022},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.52920001745224},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5048999786376953},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.49079999327659607},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.4896000027656555},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.48420000076293945},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.47620001435279846},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.43619999289512634},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42260000109672546},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.4066999852657318},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.39730000495910645},{"id":"https://openalex.org/C92548554","wikidata":"https://www.wikidata.org/wiki/Q2262868","display_name":"Domain model","level":3,"score":0.3756999969482422},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3630000054836273},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3587999939918518},{"id":"https://openalex.org/C100463513","wikidata":"https://www.wikidata.org/wiki/Q5227322","display_name":"Data model (GIS)","level":2,"score":0.3580999970436096},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.3481999933719635},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.32690000534057617},{"id":"https://openalex.org/C135257023","wikidata":"https://www.wikidata.org/wiki/Q691358","display_name":"Domain-specific language","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3199000060558319},{"id":"https://openalex.org/C180152950","wikidata":"https://www.wikidata.org/wiki/Q2904257","display_name":"Software development process","level":4,"score":0.3197999894618988},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.31790000200271606},{"id":"https://openalex.org/C5977032","wikidata":"https://www.wikidata.org/wiki/Q5289815","display_name":"Domain engineering","level":5,"score":0.3165000081062317},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C529173508","wikidata":"https://www.wikidata.org/wiki/Q638608","display_name":"Software development","level":3,"score":0.2904999852180481},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2775000035762787},{"id":"https://openalex.org/C35280785","wikidata":"https://www.wikidata.org/wiki/Q559486","display_name":"System lifecycle","level":4,"score":0.27649998664855957},{"id":"https://openalex.org/C148027188","wikidata":"https://www.wikidata.org/wiki/Q907375","display_name":"Unit testing","level":3,"score":0.27549999952316284},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2718999981880188},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C169806903","wikidata":"https://www.wikidata.org/wiki/Q5937752","display_name":"Human error","level":2,"score":0.2533999979496002},{"id":"https://openalex.org/C207850805","wikidata":"https://www.wikidata.org/wiki/Q269608","display_name":"Reverse engineering","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.24819","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24819","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.24819","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24819","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reliably":[0],"transferring":[1],"specialized":[2],"human":[3,239],"knowledge":[4,71,201],"from":[5,74],"text":[6],"into":[7,136,241],"large":[8],"language":[9,242],"models":[10],"remains":[11],"a":[12,34,38,69,100,199,231],"fundamental":[13],"challenge":[14],"in":[15,50,99,150],"artificial":[16],"intelligence.":[17],"Fine-tuning":[18],"on":[19,37],"domain":[20,39],"corpora":[21],"has":[22],"enabled":[23],"substantial":[24],"capability":[25],"gains,":[26],"but":[27],"the":[28,51,55,75,80,89,95,112,151,189,195,214,235],"process":[29],"operates":[30],"without":[31,170],"feedback:":[32],"when":[33,68],"model":[35,113,116,133,166,220],"fails":[36],"task,":[40],"there":[41],"is":[42,48,58,222],"no":[43],"method":[44],"to":[45,59,147],"diagnose":[46],"what":[47,111],"deficient":[49],"training":[52,85,105,117,206,217],"data,":[53],"and":[54,87,102,124,139,153,168,182,194,205,219,225],"only":[56],"recourse":[57],"add":[60],"more":[61],"data":[62,86,106,126,152,218],"indiscriminately.":[63],"Here":[64],"we":[65],"show":[66],"that":[67,142,213],"structured":[70,200],"representation":[72],"extracted":[73],"source":[76,108],"corpus":[77,207],"serves":[78],"as":[79,178,208],"shared":[81],"foundation":[82,233],"for":[83,234],"both":[84],"evaluation,":[88],"complete":[90],"data-engineering":[91],"lifecycle":[92,98],"maps":[93],"onto":[94],"software":[96],"development":[97],"precise":[101],"operative":[103],"way:":[104],"becomes":[107,118,121,128],"code":[109],"specifying":[110],"should":[114],"learn,":[115],"compilation,":[119],"benchmarking":[120],"unit":[122],"testing,":[123],"failure-driven":[125],"repair":[127,160],"debugging.":[129],"Under":[130],"this":[131,176,228],"correspondence,":[132],"failures":[134],"decompose":[135],"concept-level":[137],"gaps":[138],"reasoning-chain":[140],"breaks":[141],"can":[143],"be":[144],"traced":[145],"back":[146],"specific":[148],"deficiencies":[149],"repaired":[154],"through":[155],"targeted":[156],"patches,":[157],"with":[158,180],"each":[159],"cycle":[161],"producing":[162],"consistent":[163],"improvements":[164],"across":[165,185],"scales":[167],"architectures":[169],"degrading":[171],"general":[172],"capabilities.":[173],"We":[174],"formalize":[175],"principle":[177],"Programming":[179],"Data":[181],"instantiate":[183],"it":[184],"sixteen":[186],"disciplines":[187],"spanning":[188],"natural":[190],"sciences,":[191,197],"engineering,":[192],"biomedicine,":[193],"social":[196],"releasing":[198],"base,":[202],"benchmark":[203],"suite,":[204],"open":[209],"resources.":[210],"By":[211],"demonstrating":[212],"relationship":[215],"between":[216],"behaviour":[221],"structurally":[223],"traceable":[224],"systematically":[226],"repairable,":[227],"work":[229],"establishes":[230],"principled":[232],"reliable":[236],"engineering":[237],"of":[238],"expertise":[240],"models.":[243]},"counts_by_year":[],"updated_date":"2026-04-30T06:11:10.768123","created_date":"2026-04-30T00:00:00"}
