{"id":"https://openalex.org/W7130615426","doi":"https://doi.org/10.48550/arxiv.2602.16666","title":"Towards a Science of AI Agent Reliability","display_name":"Towards a Science of AI Agent Reliability","publication_year":2026,"publication_date":"2026-02-18","ids":{"openalex":"https://openalex.org/W7130615426","doi":"https://doi.org/10.48550/arxiv.2602.16666"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.16666","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030986512","display_name":"Stephan Rabanser","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rabanser, Stephan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126439331","display_name":"Sayash Kapoor","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kapoor, Sayash","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126385549","display_name":"Peter Kirgis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kirgis, Peter","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088475090","display_name":"K. Liu","orcid":"https://orcid.org/0000-0002-6491-5862"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Kangheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126364523","display_name":"Saiteja Utpala","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Utpala, Saiteja","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5107529540","display_name":"ARVIND NARAYANAN","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Narayanan, Arvind","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5030986512"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.31869998574256897,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.31869998574256897,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.28029999136924744,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1225999966263771,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.6586999893188477},{"id":"https://openalex.org/keywords/complement","display_name":"Complement (music)","score":0.6396999955177307},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.597000002861023},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5817000269889832},{"id":"https://openalex.org/keywords/bounded-function","display_name":"Bounded function","score":0.4020000100135803},{"id":"https://openalex.org/keywords/multi-agent-system","display_name":"Multi-agent system","score":0.3264999985694885}],"concepts":[{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.6586999893188477},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.6396999955177307},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6363000273704529},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.597000002861023},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5817000269889832},{"id":"https://openalex.org/C34388435","wikidata":"https://www.wikidata.org/wiki/Q2267362","display_name":"Bounded function","level":2,"score":0.4020000100135803},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3953000009059906},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.3937000036239624},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.36640000343322754},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.362199991941452},{"id":"https://openalex.org/C41550386","wikidata":"https://www.wikidata.org/wiki/Q529909","display_name":"Multi-agent system","level":2,"score":0.3264999985694885},{"id":"https://openalex.org/C74072328","wikidata":"https://www.wikidata.org/wiki/Q1142726","display_name":"Intelligent agent","level":2,"score":0.29840001463890076},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C2780873155","wikidata":"https://www.wikidata.org/wiki/Q392811","display_name":"Agent-based model","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26010000705718994},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.25220000743865967}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.16666","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.16666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.16666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.16666","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"AI":[0],"agents":[1,20,52,131],"are":[2],"increasingly":[3],"deployed":[4],"to":[5,23],"execute":[6],"important":[7],"tasks.":[8],"While":[9],"rising":[10],"accuracy":[11],"scores":[12],"on":[13],"standard":[14],"benchmarks":[15],"suggest":[16],"rapid":[17],"progress,":[18],"many":[19],"still":[21],"continue":[22],"fail":[24,59],"in":[25,67,112],"practice.":[26],"This":[27],"discrepancy":[28],"highlights":[29],"a":[30,40,72],"fundamental":[31],"limitation":[32],"of":[33],"current":[34],"evaluations:":[35],"compressing":[36],"agent":[37,83],"behavior":[38],"into":[39],"single":[41],"success":[42],"metric":[43],"obscures":[44],"critical":[45],"operational":[46],"flaws.":[47],"Notably,":[48],"it":[49],"ignores":[50],"whether":[51],"behave":[53],"consistently":[54],"across":[55,97],"runs,":[56],"withstand":[57],"perturbations,":[58],"predictably,":[60],"or":[61],"have":[62,107],"bounded":[63],"error":[64],"severity.":[65],"Grounded":[66],"safety-critical":[68],"engineering,":[69],"we":[70,101],"provide":[71],"holistic":[73],"performance":[74],"profile":[75],"by":[76],"proposing":[77],"twelve":[78],"concrete":[79],"metrics":[80,120],"that":[81,103],"decompose":[82],"reliability":[84],"along":[85],"four":[86],"key":[87],"dimensions:":[88],"consistency,":[89],"robustness,":[90],"predictability,":[91],"and":[92,134],"safety.":[93],"Evaluating":[94],"14":[95],"models":[96],"two":[98],"complementary":[99],"benchmarks,":[100],"find":[102],"recent":[104],"capability":[105],"gains":[106],"only":[108],"yielded":[109],"small":[110],"improvements":[111],"reliability.":[113],"By":[114],"exposing":[115],"these":[116],"persistent":[117],"limitations,":[118],"our":[119],"complement":[121],"traditional":[122],"evaluations":[123],"while":[124],"offering":[125],"tools":[126],"for":[127],"reasoning":[128],"about":[129],"how":[130],"perform,":[132],"degrade,":[133],"fail.":[135]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-20T00:00:00"}
