{"id":"https://openalex.org/W7162409218","doi":"https://doi.org/10.48550/arxiv.2605.26087","title":"DiscoverPhysics: Benchmarking LLMs for Out-of-the-Box Scientific Thinking","display_name":"DiscoverPhysics: Benchmarking LLMs for Out-of-the-Box Scientific Thinking","publication_year":2026,"publication_date":"2026-05-25","ids":{"openalex":"https://openalex.org/W7162409218","doi":"https://doi.org/10.48550/arxiv.2605.26087"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.26087","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26087","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.26087","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135060900","display_name":"Matt L. Wiemann","orcid":"https://orcid.org/0000-0001-5748-5393"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wiemann, Matt L.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137019195","display_name":"Lindsay M. Smith","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smith, Lindsay M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066501666","display_name":"P. Melchior","orcid":"https://orcid.org/0000-0002-8873-5065"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Melchior, Peter","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137066441","display_name":"Siddharth Mishra-Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mishra-Sharma, Siddharth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101694421","display_name":"Andrew Gordon Wilson","orcid":"https://orcid.org/0000-0002-2011-3315"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wilson, Andrew Gordon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069234607","display_name":"Pavel Izmailov","orcid":"https://orcid.org/0009-0005-0772-2879"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Izmailov, Pavel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034486773","display_name":"Carolina Cuesta-Lazaro","orcid":"https://orcid.org/0000-0002-6069-2999"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cuesta-L\u00e1zaro, Carolina","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.43380001187324524,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.43380001187324524,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.10249999910593033,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.03669999912381172,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.6255999803543091},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.621399998664856},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5490000247955322},{"id":"https://openalex.org/keywords/frontier","display_name":"Frontier","score":0.4332999885082245},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4320000112056732},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.42480000853538513},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.4025000035762787},{"id":"https://openalex.org/keywords/scientific-reasoning","display_name":"Scientific reasoning","score":0.37720000743865967},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.3395000100135803}],"concepts":[{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.6255999803543091},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.621399998664856},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5490000247955322},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.4332999885082245},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4320000112056732},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.42480000853538513},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.4025000035762787},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3971000015735626},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3926999866962433},{"id":"https://openalex.org/C2992562121","wikidata":"https://www.wikidata.org/wiki/Q3817808","display_name":"Scientific reasoning","level":2,"score":0.37720000743865967},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.3650999963283539},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.3634999990463257},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C64802038","wikidata":"https://www.wikidata.org/wiki/Q131012","display_name":"Occam's razor","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3142000138759613},{"id":"https://openalex.org/C138379479","wikidata":"https://www.wikidata.org/wiki/Q1116876","display_name":"Scientific modelling","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C2778145024","wikidata":"https://www.wikidata.org/wiki/Q17052147","display_name":"Dismissal","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.26669999957084656},{"id":"https://openalex.org/C29554801","wikidata":"https://www.wikidata.org/wiki/Q147027","display_name":"Thought experiment","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.2605000138282776},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.25929999351501465},{"id":"https://openalex.org/C136172866","wikidata":"https://www.wikidata.org/wiki/Q1088088","display_name":"Possible world","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C14224292","wikidata":"https://www.wikidata.org/wiki/Q13600188","display_name":"Conceptual framework","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C79585631","wikidata":"https://www.wikidata.org/wiki/Q431498","display_name":"Confirmation bias","level":2,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.26087","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26087","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.26087","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26087","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7531250715255737,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Frontier":[0],"LLMs":[1],"now":[2],"perform":[3],"strongly":[4],"across":[5],"a":[6,33,43,102,110,119],"wide":[7],"range":[8],"of":[9,22,40,42,92,105,113,165,181],"physics":[10,47,108],"evaluations,":[11],"but":[12],"it":[13],"is":[14,77],"hard":[15],"to":[16,36,124,206],"disentangle":[17],"genuine":[18],"reasoning":[19,136],"from":[20,50,214],"recall":[21],"established":[23],"science.":[24],"We":[25,53,141,217],"introduce":[26],"DiscoverPhysics,":[27],"an":[28,82,138,154,159],"interactive":[29],"benchmark":[30,133],"that":[31,174,220,231],"asks":[32],"LLM":[34],"agent":[35,88,123],"discover":[37],"the":[38,87,106,114,122,132,175,182,215],"laws":[39],"motion":[41],"simulated":[44],"world":[45,76,120],"whose":[46],"deliberately":[48],"deviates":[49],"our":[51],"own.":[52],"construct":[54],"22":[55],"worlds":[56,183],"governed":[57],"by,":[58],"among":[59],"others,":[60],"screened":[61],"and":[62,72,98,109,128,153,184,210,230],"fractional-power":[63],"gravity,":[64],"multi-species":[65],"couplings,":[66],"hidden":[67],"dark-matter-like":[68],"particles,":[69],"non-coordinate-free":[70],"physics,":[71],"time-varying":[73],"interactions.":[74],"Each":[75],"generated":[78],"on":[79,150,187,235],"demand":[80],"by":[81],"N-body":[83],"simulator,":[84],"for":[85],"which":[86],"proposes":[89],"several":[90],"rounds":[91],"experiments,":[93],"observes":[94],"raw":[95],"trajectory":[96,148],"data,":[97],"ultimately":[99],"submits":[100],"both":[101,202],"natural-language":[103],"explanation":[104,156,228],"world's":[107],"Python":[111],"implementation":[112],"inferred":[115],"law.":[116],"Because":[117],"solving":[118],"requires":[121],"design":[125,207],"informative":[126,208],"experiments":[127,209],"revise":[129],"its":[130],"hypotheses,":[131],"probes":[134],"long-horizon":[135],"over":[137],"experimental":[139],"history.":[140],"evaluate":[142],"submissions":[143],"along":[144],"two":[145],"complementary":[146],"axes:":[147],"MSE":[149],"held-out":[151],"particles":[152],"LLM-judged":[155],"score":[157],"following":[158],"expert-written":[160],"rubric":[161],"assessing":[162],"conceptual":[163,232],"understanding":[164,233],"each":[166],"world.":[167],"Across":[168],"eleven":[169],"frontier":[170],"models,":[171,201],"we":[172],"find":[173,219],"strongest":[176],"agents":[177],"pass":[178],"only":[179],"half":[180],"consistently":[185],"fail":[186],"those":[188],"where":[189],"latent":[190],"structure":[191],"must":[192],"be":[193],"uncovered.":[194],"Open-source":[195],"models":[196],"lag":[197],"substantially":[198],"behind":[199],"commercial":[200],"in":[203,211],"their":[204],"ability":[205],"extracting":[212],"conclusions":[213],"data.":[216],"further":[218],"good":[221],"predictive":[222],"accuracy":[223],"does":[224],"not":[225],"guarantee":[226],"high":[227],"quality":[229],"depends":[234],"hypothesis":[236],"refinement":[237],"through":[238],"well-chosen":[239],"experiments.":[240]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-27T00:00:00"}
