{"id":"https://openalex.org/W7128532971","doi":"https://doi.org/10.48550/arxiv.2602.07150","title":"On Randomness in Agentic Evals","display_name":"On Randomness in Agentic Evals","publication_year":2026,"publication_date":"2026-02-06","ids":{"openalex":"https://openalex.org/W7128532971","doi":"https://doi.org/10.48550/arxiv.2602.07150"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.07150","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125592231","display_name":"Bjarni Haukur Bjarnason","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bjarnason, Bjarni Haukur","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125484953","display_name":"Andr\u00e9 Silva","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Silva, Andr\u00e9","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Monperrus, Martin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Monperrus, 
Martin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5125592231"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.12449999898672104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.12449999898672104,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.1111999973654747,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.05139999836683273,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer 
Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/randomness","display_name":"Randomness","score":0.7552000284194946},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.5548999905586243},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4390000104904175},{"id":"https://openalex.org/keywords/statistical-hypothesis-testing","display_name":"Statistical hypothesis testing","score":0.4124000072479248},{"id":"https://openalex.org/keywords/statistical-power","display_name":"Statistical power","score":0.3944000005722046},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.3758000135421753},{"id":"https://openalex.org/keywords/percentage-point","display_name":"Percentage point","score":0.36559998989105225}],"concepts":[{"id":"https://openalex.org/C125112378","wikidata":"https://www.wikidata.org/wiki/Q176640","display_name":"Randomness","level":2,"score":0.7552000284194946},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.5548999905586243},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.46230000257492065},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.45579999685287476},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.45179998874664307},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise 
(video)","level":3,"score":0.4390000104904175},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.41760000586509705},{"id":"https://openalex.org/C87007009","wikidata":"https://www.wikidata.org/wiki/Q210832","display_name":"Statistical hypothesis testing","level":2,"score":0.4124000072479248},{"id":"https://openalex.org/C96608239","wikidata":"https://www.wikidata.org/wiki/Q1199823","display_name":"Statistical power","level":2,"score":0.3944000005722046},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.3758000135421753},{"id":"https://openalex.org/C44648626","wikidata":"https://www.wikidata.org/wiki/Q1049848","display_name":"Percentage point","level":2,"score":0.36559998989105225},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.31189998984336853},{"id":"https://openalex.org/C22679943","wikidata":"https://www.wikidata.org/wiki/Q159375","display_name":"Standard deviation","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C21080849","wikidata":"https://www.wikidata.org/wiki/Q13611879","display_name":"Data 
point","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26820001006126404},{"id":"https://openalex.org/C2986587452","wikidata":"https://www.wikidata.org/wiki/Q938438","display_name":"Statistical analysis","level":2,"score":0.2597000002861023}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.07150","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.07150","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.07150","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.07150","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open 
MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Agentic":[0],"systems":[1],"are":[2,200],"evaluated":[3],"on":[4,43,66],"benchmarks":[5],"where":[6],"agents":[7],"interact":[8],"with":[9,71,184],"environments":[10],"to":[11,61,160,167,186],"solve":[12],"tasks.":[13],"Most":[14],"papers":[15],"report":[16],"a":[17,22,30],"pass@1":[18,56,143],"score":[19],"computed":[20],"from":[21,144,207],"single":[23],"run":[24,68],"per":[25,148],"task,":[26,149],"assuming":[27],"this":[28,36],"gives":[29],"reliable":[31,131],"performance":[32,191],"estimate.":[33],"We":[34,51],"test":[35],"assumption":[37],"by":[38,59],"collecting":[39],"60,000":[40],"agentic":[41,134],"trajectories":[42,108],"SWE-Bench-Verified,":[44],"spanning":[45],"three":[46,138],"models":[47],"and":[48,119,172,180],"two":[49],"scaffolds.":[50],"find":[52],"substantial":[53],"variance:":[54],"single-run":[55],"estimates":[57],"vary":[58],"2.2":[60],"6.0":[62],"percentage":[63,76,91],"points":[64,77,92],"depending":[65],"which":[67],"is":[69],"selected,":[70],"standard":[72],"deviations":[73],"exceeding":[74],"1.5":[75],"even":[78],"at":[79],"temperature":[80],"0.":[81],"This":[82],"variance":[83],"has":[84],"critical":[85],"implications:":[86],"reported":[87],"improvements":[88],"of":[89,117,133,164],"2--3":[90],"may":[93],"reflect":[94],"evaluation":[95,132,197],"noise":[96],"rather":[97],"than":[98],"genuine":[99,204],"algorithmic":[1
00],"progress.":[101],"Through":[102],"token-level":[103],"analysis,":[104],"we":[105,136],"show":[106],"that":[107,120],"diverge":[109],"early,":[110],"often":[111],"within":[112],"the":[113,162,189],"first":[114],"few":[115],"percent":[116],"tokens,":[118],"these":[121,194],"small":[122,153],"differences":[123],"cascade":[124],"into":[125],"different":[126],"solution":[127],"strategies.":[128],"To":[129],"enable":[130],"systems,":[135],"recommend":[137],"concrete":[139],"practices:":[140],"(1)":[141],"estimate":[142],"multiple":[145],"independent":[146],"runs":[147,165],"especially":[150],"when":[151],"measuring":[152],"improvements,":[154],"(2)":[155],"use":[156],"statistical":[157,208],"power":[158],"analysis":[159],"determine":[161],"number":[163],"needed":[166],"detect":[168],"expected":[169],"effect":[170],"sizes,":[171],"(3)":[173],"consider":[174],"metrics":[175],"like":[176],"pass@k":[177],"(optimistic":[178],"bound)":[179,183],"pass^k":[181],"(pessimistic":[182],"k>1":[185],"better":[187],"characterize":[188],"full":[190],"envelope.":[192],"While":[193],"practices":[195],"increase":[196],"cost,":[198],"they":[199],"essential":[201],"for":[202],"distinguishing":[203],"scientific":[205],"progress":[206],"noise.":[209]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-11T00:00:00"}
