{"id":"https://openalex.org/W7160945459","doi":"https://doi.org/10.48550/arxiv.2605.08678","title":"MLS-Bench: A Holistic and Rigorous Assessment of AI Systems on Building Better AI","display_name":"MLS-Bench: A Holistic and Rigorous Assessment of AI Systems on Building Better AI","publication_year":2026,"publication_date":"2026-05-09","ids":{"openalex":"https://openalex.org/W7160945459","doi":"https://doi.org/10.48550/arxiv.2605.08678"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08678","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08678","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08678","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018115183","display_name":"Bohan Lyu","orcid":"https://orcid.org/0009-0005-6462-8942"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Bohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135933574","display_name":"Yucheng Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yucheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135924497","display_name":"Siqiao Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Siqiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059144000","display_name":"Jiaru Zhang","orcid":"https://orcid.org/0000-0001-5909-1005"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiaru","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120840761","display_name":"Qixin Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Qixin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135956031","display_name":"Xinghan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xinghan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112895215","display_name":"Xinyang Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Xinyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124971679","display_name":"Yicheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135998780","display_name":"Huaqing Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Huaqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135925239","display_name":"Runhan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Runhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135987387","display_name":"Kaicheng Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Kaicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135920624","display_name":"Zitao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zitao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039366956","display_name":"Wentao Guo","orcid":"https://orcid.org/0000-0001-8058-8323"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Wentao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101942528","display_name":"Junlin Yang","orcid":"https://orcid.org/0000-0001-5381-169X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Junlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135974947","display_name":"Xinyue Ai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ai, Xinyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135948434","display_name":"Wenhao Chai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chai, Wenhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135919765","display_name":"Yadi Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yadi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108946224","display_name":"Ziran Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ziran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135987419","display_name":"Kun Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Kun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133664397","display_name":"Dapeng Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Dapeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135926485","display_name":"Huan-ang Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Huan-ang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135988677","display_name":"Shange Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Shange","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082788397","display_name":"Chengshuai Shi","orcid":"https://orcid.org/0000-0002-2727-8251"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Chengshuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136001104","display_name":"Simon S. Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Simon S.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037154191","display_name":"Max Simchowitz","orcid":"https://orcid.org/0000-0001-9900-1238"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Simchowitz, Max","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034192173","display_name":"Jiantao Jiao","orcid":"https://orcid.org/0000-0003-3766-8031"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiao, Jiantao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135989871","display_name":"Dawn Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Dawn","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135957549","display_name":"Chi Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Chi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":28,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.10360000282526016,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.10360000282526016,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.07769999653100967,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.0729999989271164,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.7609000205993652},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6850000023841858},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6284000277519226},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.5146999955177307},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.49639999866485596},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.47909998893737793}],"concepts":[{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.7609000205993652},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7139999866485596},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6850000023841858},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6284000277519226},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.5146999955177307},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.49639999866485596},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.47909998893737793},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.47189998626708984},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4544999897480011},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4320000112056732},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.40939998626708984},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3635999858379364},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.319599986076355},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08678","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08678","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08678","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08678","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"AI":[1,58],"progress":[2],"has":[3],"been":[4],"driven":[5],"by":[6],"ML":[7,65,85],"methods":[8,43],"that":[9,91,102,113,152],"are":[10],"generalizable":[11,62],"across":[12,71,95],"settings":[13,97],"and":[14,29,63,89,98,112,135,172,190,197,200,204],"scalable":[15,64],"to":[16,36,78,169],"larger":[17],"regimes.":[18],"As":[19],"large":[20],"language":[21],"models":[22],"demonstrate":[23,90],"advanced":[24],"capabilities":[25],"in":[26,158,164],"reasoning,":[27],"coding,":[28],"engineering":[30],"tasks,":[31],"it":[32],"is":[33,116,155],"increasingly":[34],"important":[35],"understand":[37],"whether":[38,57],"they":[39],"can":[40,60],"discover":[41],"such":[42],"rather":[44],"than":[45,120],"only":[46,157],"apply":[47],"existing":[48],"ones.":[49],"We":[50,100,124,188],"introduce":[51],"MLS-Bench,":[52],"a":[53,192],"benchmark":[54],"for":[55,118,195],"evaluating":[56],"systems":[59],"invent":[61],"methods.":[66],"MLS-Bench":[67],"contains":[68],"140":[69],"tasks":[70],"12":[72],"domains,":[73],"each":[74],"requiring":[75],"an":[76,84],"agent":[77],"improve":[79],"one":[80],"targeted":[81],"component":[82],"of":[83,129,146],"system":[86],"or":[87,180],"algorithm":[88],"the":[92,127,153,165,202],"improvement":[93],"generalizes":[94],"controlled":[96],"scales.":[99],"find":[101],"current":[103],"agents":[104],"remain":[105],"far":[106],"from":[107],"reliably":[108],"surpassing":[109],"human-designed":[110],"methods,":[111,161],"engineering-style":[114],"tuning":[115],"easier":[117],"them":[119],"genuine":[121],"method":[122],"invention.":[123],"further":[125],"study":[126],"effects":[128],"test-time":[130],"scaling,":[131],"adaptive":[132],"compute":[133],"allocation,":[134],"context":[136,181],"provision":[137],"on":[138],"agents'":[139],"discovery":[140],"performance,":[141],"together":[142],"with":[143],"case":[144],"studies":[145],"their":[147],"behavior.":[148],"Our":[149],"analyses":[150],"suggest":[151],"bottleneck":[154],"not":[156,184],"proposing":[159],"new":[160],"but":[162],"also":[163],"scientific":[166],"insight":[167],"needed":[168],"plan,":[170],"validate,":[171],"scale":[173],"claims":[174],"about":[175],"them.":[176],"More":[177],"search,":[178],"compute,":[179],"alone":[182],"does":[183],"remove":[185],"this":[186],"bottleneck.":[187],"build":[189],"maintain":[191],"community":[193],"platform":[194],"cumulative":[196],"comparable":[198],"iteration,":[199],"release":[201],"data":[203],"code":[205],"at":[206],"https://mls-bench.com.":[207]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
