{"id":"https://openalex.org/W7161994113","doi":"https://doi.org/10.48550/arxiv.2605.20520","title":"Open-World Evaluations for Measuring Frontier AI Capabilities","display_name":"Open-World Evaluations for Measuring Frontier AI Capabilities","publication_year":2026,"publication_date":"2026-05-19","ids":{"openalex":"https://openalex.org/W7161994113","doi":"https://doi.org/10.48550/arxiv.2605.20520"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.20520","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20520","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.20520","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063345981","display_name":"Sayash Kapoor","orcid":"https://orcid.org/0000-0001-5695-280X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kapoor, Sayash","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136686963","display_name":"Peter Kirgis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kirgis, Peter","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136602712","display_name":"Andrew Schwartz","orcid":"https://orcid.org/0009-0002-8913-2562"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schwartz, Andrew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030986512","display_name":"Stephan Rabanser","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rabanser, Stephan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136635746","display_name":"J. J. Allaire","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Allaire, J. J.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136669952","display_name":"Rishi Bommasani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bommasani, Rishi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087769785","display_name":"Harry Coppock","orcid":"https://orcid.org/0000-0003-2404-8319"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Coppock, Harry","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055765010","display_name":"Magda Dubois","orcid":"https://orcid.org/0000-0002-5396-1855"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dubois, Magda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136647798","display_name":"Gillian K Hadfield","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hadfield, Gillian K","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136641416","display_name":"Andrew B. Hall","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hall, Andrew B.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078850040","display_name":"Sara Hooker","orcid":"https://orcid.org/0000-0002-0190-6459"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hooker, Sara","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136671922","display_name":"Seth Lazar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lazar, Seth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136663793","display_name":"Steve Newman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Newman, Steve","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120841880","display_name":"Dimitris Papailiopoulos","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Papailiopoulos, Dimitris","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054885656","display_name":"Shoshannah Tekofsky","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tekofsky, Shoshannah","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011967325","display_name":"Helen Toner","orcid":"https://orcid.org/0000-0002-2465-2682"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Toner, Helen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136675518","display_name":"Cozmin Ududec","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ududec, Cozmin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136709401","display_name":"Arvind Narayanan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Narayanan, Arvind","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":18,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.2207999974489212,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.2207999974489212,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.09019999951124191,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.08720000088214874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.7785999774932861},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.5476999878883362},{"id":"https://openalex.org/keywords/frontier","display_name":"Frontier","score":0.5304999947547913},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.44839999079704285},{"id":"https://openalex.org/keywords/tracking","display_name":"Tracking (education)","score":0.4426000118255615},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.3610999882221222},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.35040000081062317}],"concepts":[{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.7785999774932861},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.730400025844574},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.5476999878883362},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.5304999947547913},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.44839999079704285},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.4426000118255615},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3905999958515167},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.3610999882221222},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C3018587665","wikidata":"https://www.wikidata.org/wiki/Q7268696","display_name":"Qualitative analysis","level":3,"score":0.3321000039577484},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.33180001378059387},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3093999922275543},{"id":"https://openalex.org/C29825287","wikidata":"https://www.wikidata.org/wiki/Q1427940","display_name":"Warning system","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28519999980926514},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2759999930858612},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.2583000063896179},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2522999942302704}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.20520","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20520","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.20520","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20520","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Partnerships for the goals","score":0.5508267283439636,"id":"https://metadata.un.org/sdg/17"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Benchmark-based":[0],"evaluation":[1],"remains":[2],"important":[3],"for":[4,44,86,92,150],"tracking":[5],"frontier":[6],"AI":[7,88,104],"progress.":[8],"But":[9],"it":[10,19],"can":[11,23,135],"both":[12],"overstate":[13],"and":[14,33,38,79,81,108,152],"understate":[15],"deployed":[16],"capability":[17],"because":[18],"privileges":[20],"tasks":[21,58],"that":[22,132,141],"be":[24],"precisely":[25],"specified,":[26],"automatically":[27],"graded,":[28],"easy":[29],"to":[30,114],"optimize":[31],"for,":[32],"run":[34],"with":[35,106,124,148],"low":[36],"budgets":[37],"short":[39],"time":[40],"horizons.":[41],"We":[42,146],"advocate":[43],"a":[45,90,98,110,126],"complementary":[46],"class":[47],"of":[48,139],"evaluations,":[49,75],"which":[50],"we":[51,71,101],"term":[52],"open-world":[53,74,133,154],"evaluations:":[54],"long-horizon,":[55],"messy,":[56],"real-world":[57],"assessed":[59],"through":[60],"small-sample":[61],"qualitative":[62],"analysis":[63],"rather":[64],"than":[65],"benchmark-scale":[66],"automation.":[67],"In":[68],"this":[69],"paper":[70],"survey":[72],"recent":[73],"identify":[76],"their":[77],"strengths":[78],"limitations,":[80],"introduce":[82],"CRUX":[83],"(Collaborative":[84],"Research":[85],"Updating":[87],"eXpectations),":[89],"project":[91],"conducting":[93],"such":[94],"evaluations":[95,134],"regularly.":[96],"As":[97],"first":[99],"instance,":[100],"task":[102,123],"an":[103],"agent":[105,120],"developing":[107],"publishing":[109],"simple":[111],"iOS":[112],"application":[113],"the":[115,122],"Apple":[116],"App":[117],"Store.":[118],"The":[119],"completed":[121],"only":[125],"single":[127],"avoidable":[128],"manual":[129],"intervention,":[130],"suggesting":[131],"provide":[136],"early":[137],"warning":[138],"capabilities":[140],"may":[142],"soon":[143],"become":[144],"widespread.":[145],"conclude":[147],"recommendations":[149],"designing":[151],"reporting":[153],"evals.":[155]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-22T00:00:00"}
