{"id":"https://openalex.org/W7161237792","doi":"https://doi.org/10.48550/arxiv.2605.13950","title":"Collider-Bench: Benchmarking AI Agents with Particle Physics Analysis Reproduction","display_name":"Collider-Bench: Benchmarking AI Agents with Particle Physics Analysis Reproduction","publication_year":2026,"publication_date":"2026-05-13","ids":{"openalex":"https://openalex.org/W7161237792","doi":"https://doi.org/10.48550/arxiv.2605.13950"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.13950","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13950","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.13950","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071317702","display_name":"Darius A. Faroughy","orcid":"https://orcid.org/0000-0002-4027-5477"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Faroughy, Darius A.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136194775","display_name":"Sofia Palacios Schweitzer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schweitzer, Sofia Palacios","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062933995","display_name":"Ian Pang","orcid":"https://orcid.org/0000-0002-8225-7269"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pang, Ian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136213197","display_name":"Siddharth Mishra-Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mishra-Sharma, Siddharth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5072503522","display_name":"David Shih","orcid":"https://orcid.org/0000-0003-3408-3871"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shih, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.3314000070095062,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.3314000070095062,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.08940000087022781,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13643","display_name":"Artificial Intelligence in Law","score":0.043800000101327896,"subfield":{"id":"https://openalex.org/subfields/3320","display_name":"Political Science and International Relations"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.6531999707221985},{"id":"https://openalex.org/keywords/codebase","display_name":"Codebase","score":0.6510000228881836},{"id":"https://openalex.org/keywords/toolchain","display_name":"Toolchain","score":0.5544000267982483},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5020999908447266},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.44589999318122864},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.44519999623298645},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4438999891281128},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.43299999833106995},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.429500013589859},{"id":"https://openalex.org/keywords/session","display_name":"Session (web analytics)","score":0.4185999929904938}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6715999841690063},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.6531999707221985},{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.6510000228881836},{"id":"https://openalex.org/C2777062904","wikidata":"https://www.wikidata.org/wiki/Q545406","display_name":"Toolchain","level":3,"score":0.5544000267982483},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5020999908447266},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.44589999318122864},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.44519999623298645},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4438999891281128},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.43299999833106995},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.429500013589859},{"id":"https://openalex.org/C2779182362","wikidata":"https://www.wikidata.org/wiki/Q17126187","display_name":"Session (web analytics)","level":2,"score":0.4185999929904938},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4180999994277954},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.37459999322891235},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3686999976634979},{"id":"https://openalex.org/C87668248","wikidata":"https://www.wikidata.org/wiki/Q40605","display_name":"Large Hadron Collider","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.359499990940094},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3497999906539917},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3418999910354614},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.3416000008583069},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3359000086784363},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C138673069","wikidata":"https://www.wikidata.org/wiki/Q322229","display_name":"Tracing","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30160000920295715},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2994000017642975},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.2856999933719635},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.28540000319480896},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C167981075","wikidata":"https://www.wikidata.org/wiki/Q2667186","display_name":"Sandbox (software development)","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C2780264999","wikidata":"https://www.wikidata.org/wiki/Q7445032","display_name":"Security domain","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2702000141143799},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.2671999931335449},{"id":"https://openalex.org/C153180980","wikidata":"https://www.wikidata.org/wiki/Q19776675","display_name":"Commit","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C112505250","wikidata":"https://www.wikidata.org/wiki/Q787116","display_name":"Automaton","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.13950","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13950","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.13950","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.13950","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Autonomous":[0],"language-model":[1],"agents":[2,36],"are":[3,57,131],"increasingly":[4],"evaluated":[5,132],"on":[6,93,217],"long-horizon":[7],"tool-use":[8],"tasks,":[9],"but":[10],"existing":[11],"benchmarks":[12],"rarely":[13],"capture":[14],"the":[15,42,63,68,73,77,107,149,161,223],"complexity":[16],"and":[17,51,98,119,163,180,198],"nuance":[18],"of":[19,187,208],"real":[20],"scientific":[21,53],"work.":[22],"To":[23],"address":[24],"this":[25],"gap,":[26],"we":[27,159],"introduce":[28],"Collider-Bench,":[29],"a":[30,86,111,143,195,205],"benchmark":[31],"for":[32,85],"evaluating":[33],"whether":[34],"LLM":[35,169],"can":[37],"reproduce":[38,61],"experimental":[39,74],"analyses":[40,56],"from":[41,190],"Large":[43],"Hadron":[44],"Collider":[45],"(LHC)":[46],"using":[47,167],"only":[48,66],"public":[49,64],"papers":[50,79],"open":[52],"software.":[54],"Such":[55],"often":[58],"difficult":[59],"to":[60,100,109,171],"because":[62],"toolchain":[65],"approximates":[67],"software":[69],"used":[70],"internally":[71],"by":[72,153],"collaborations,":[75],"while":[76],"published":[78,112],"inevitably":[80],"omit":[81],"implementation":[82],"details":[83],"needed":[84],"faithful":[87],"reconstruction.":[88],"Agents":[89],"must":[90],"therefore":[91],"rely":[92],"physical":[94],"reasoning,":[95],"domain":[96],"knowledge,":[97],"trial-and-error":[99],"fill":[101],"these":[102],"gaps.":[103],"Each":[104],"task":[105],"requires":[106],"agent":[108,155,220],"turn":[110],"analysis":[113],"into":[114],"an":[115,168,184],"executable":[116],"simulation-and-selection":[117],"pipeline":[118],"submit":[120],"predicted":[121],"collision":[122],"event":[123,199],"yields":[124],"in":[125],"specified":[126],"signal":[127],"regions.":[128],"These":[129],"predictions":[130],"with":[133,194],"standard":[134],"histogram":[135],"metrics":[136],"that":[137,216],"provide":[138],"continuous":[139],"fidelity":[140],"scores":[141],"without":[142],"hand-written":[144],"rubric.":[145],"We":[146,182,202],"also":[147],"report":[148],"computational":[150],"cost":[151],"incurred":[152],"each":[154],"per":[156],"task.":[157],"Finally,":[158],"evaluate":[160,203],"codebase":[162],"full":[164],"session":[165],"trace":[166],"judge":[170],"catch":[172],"qualitative":[173],"failure":[174],"modes":[175],"such":[176],"as":[177],"fabrications,":[178],"hallucinations":[179],"duplications.":[181],"release":[183],"initial":[185],"set":[186],"tasks":[188],"drawn":[189],"LHC":[191],"searches,":[192],"together":[193],"containerized":[196],"sandbox":[197],"simulation":[200],"tools.":[201],"across":[204],"capability":[206],"ladder":[207],"general":[209],"purpose":[210],"coding":[211],"agents.":[212],"Our":[213],"results":[214],"show":[215],"average":[218],"no":[219],"reliably":[221],"beats":[222],"physicist-in-the-loop":[224],"solution.":[225]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-16T00:00:00"}
