{"id":"https://openalex.org/W7154461864","doi":"https://doi.org/10.48550/arxiv.2604.13018","title":"Toward Autonomous Long-Horizon Engineering for ML Research","display_name":"Toward Autonomous Long-Horizon Engineering for ML Research","publication_year":2026,"publication_date":"2026-04-14","ids":{"openalex":"https://openalex.org/W7154461864","doi":"https://doi.org/10.48550/arxiv.2604.13018"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.13018","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13018","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.13018","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133667642","display_name":"Guoxin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Guoxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133629043","display_name":"Jie Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133636341","display_name":"Lei Chen","orcid":"https://orcid.org/0009-0002-1970-7930"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133687745","display_name":"Jiale Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Jiale","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125627854","display_name":"Fanzhe Meng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meng, Fanzhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133709522","display_name":"Wayne Xin Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Wayne Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133688617","display_name":"Ruihua Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Ruihua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100420584","display_name":"Cheng Chen","orcid":"https://orcid.org/0000-0003-2105-8191"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133718586","display_name":"Ji-Rong Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Ji-Rong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133654601","display_name":"Kai Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5133667642"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.3287999927997589,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.3287999927997589,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.08110000193119049,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.05420000106096268,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/orchestration","display_name":"Orchestration","score":0.8197000026702881},{"id":"https://openalex.org/keywords/debugging","display_name":"Debugging","score":0.7285000085830688},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.656499981880188},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5151000022888184},{"id":"https://openalex.org/keywords/workspace","display_name":"Workspace","score":0.48649999499320984},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.47690001130104065},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.45500001311302185},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.4392000138759613}],"concepts":[{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.8197000026702881},{"id":"https://openalex.org/C168065819","wikidata":"https://www.wikidata.org/wiki/Q845566","display_name":"Debugging","level":2,"score":0.7285000085830688},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.656499981880188},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.649399995803833},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5151000022888184},{"id":"https://openalex.org/C58581272","wikidata":"https://www.wikidata.org/wiki/Q12741163","display_name":"Workspace","level":3,"score":0.48649999499320984},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.47690001130104065},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.45890000462532043},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.45500001311302185},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.4392000138759613},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4165000021457672},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.3856000006198883},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36719998717308044},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.36640000343322754},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.35269999504089355},{"id":"https://openalex.org/C207850805","wikidata":"https://www.wikidata.org/wiki/Q269608","display_name":"Reverse engineering","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.30230000615119934},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3018999993801117},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2833000123500824},{"id":"https://openalex.org/C2778827112","wikidata":"https://www.wikidata.org/wiki/Q22245680","display_name":"Feature engineering","level":3,"score":0.27250000834465027},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2696000039577484}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.13018","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13018","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.13018","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13018","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.42658284306526184,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Agentic":[0],"systems":[1,18,197],"increasingly":[2],"automate":[3],"pieces":[4],"of":[5,191,199],"AI":[6,184],"research.":[7],"Yet":[8],"turning":[9],"underspecified":[10],"research":[11,31,35,80,185],"objectives":[12],"into":[13,37],"runnable,":[14],"experimentally":[15],"validated":[16],"ML":[17,30,40],"remains":[19],"a":[20,34,38,67,77,84,138,189,196],"central":[21,49,159],"bottleneck.":[22],"We":[23,64],"study":[24],"this":[25],"operational":[26],"setting":[27],"as":[28],"\\emph{long-horizon":[29],"engineering}:":[32],"converting":[33],"specification":[36],"runnable":[39],"system":[41,69],"through":[42,83],"repeated":[43],"implementation,":[44],"experimentation,":[45],"and":[46,93,106,111,133,136,150,171],"refinement.":[47],"The":[48],"challenge":[50],"is":[51,158,186],"to":[52,160],"sustain":[53],"cumulative":[54],"project":[55,156,203],"progress":[56],"across":[57,91],"heterogeneous":[58],"stages":[59],"under":[60,122],"delayed,":[61],"confounded":[62],"feedback.":[63],"introduce":[65],"AiScientist,":[66],"multi-agent":[68],"built":[70],"around":[71],"thin":[72],"control":[73],"over":[74,99,126],"thick":[75],"state:":[76],"lightweight":[78],"hierarchical":[79],"team":[81],"coordinates":[82],"File-as-Bus":[85,164],"workspace":[86],"that":[87,154,182],"preserves":[88],"decision-relevant":[89],"artifacts":[90],"roles":[92],"invocations.":[94],"On":[95,114],"PaperBench,":[96],"AiScientist":[97],"improves":[98],"the":[100,127],"strongest":[101,128],"matched":[102,129],"baselines":[103,130],"by":[104,131,144,168,176],"9.92":[105],"11.15":[107],"points":[108,170],"with":[109],"Gemini-3-Flash":[110],"GLM-5,":[112],"respectively.":[113],"MLE-Bench":[115,172],"Lite,":[116],"it":[117],"reaches":[118],"81.82":[119],"Any":[120,146,174],"Medal\\%":[121,175],"both":[123],"backbones,":[124],"improving":[125],"4.55":[132],"16.67":[134],"points,":[135],"exceeding":[137],"Codex/GPT-5.5":[139],"xhigh":[140],"frontier":[141],"harness":[142],"reference":[143],"13.64":[145],"Medal":[147],"points.":[148,178],"Ablations":[149],"process":[151],"analyses":[152],"show":[153],"durable":[155],"state":[157],"later-round":[161],"refinement:":[162],"removing":[163],"lowers":[165],"PaperBench":[166],"score":[167],"6.41":[169],"Lite":[173],"31.82":[177],"These":[179],"results":[180],"suggest":[181],"long-horizon":[183],"not":[187],"only":[188],"problem":[190,198],"stronger":[192],"local":[193],"reasoning,":[194],"but":[195],"maintaining":[200],"cumulative,":[201],"inspectable":[202],"progress.":[204]},"counts_by_year":[],"updated_date":"2026-05-28T06:12:49.907903","created_date":"2026-04-16T00:00:00"}
