{"id":"https://openalex.org/W7156583006","doi":"https://doi.org/10.48550/arxiv.2604.23210","title":"Discovering Agentic Safety Specifications from 1-Bit Danger Signals","display_name":"Discovering Agentic Safety Specifications from 1-Bit Danger Signals","publication_year":2026,"publication_date":"2026-04-25","ids":{"openalex":"https://openalex.org/W7156583006","doi":"https://doi.org/10.48550/arxiv.2604.23210"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.23210","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23210","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.23210","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134811182","display_name":"V\u00edctor Gallego","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gallego, V\u00edctor","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5134811182"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.23680000007152557,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.23680000007152557,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1031000018119812,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.07329999655485153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.6581000089645386},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5735999941825867},{"id":"https://openalex.org/keywords/reflection","display_name":"Reflection (computer programming)","score":0.5580999851226807},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5461999773979187},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5252000093460083},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.46560001373291016},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4296000003814697},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.3418000042438507}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7032999992370605},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.6581000089645386},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5735999941825867},{"id":"https://openalex.org/C65682993","wikidata":"https://www.wikidata.org/wiki/Q1056451","display_name":"Reflection (computer programming)","level":2,"score":0.5580999851226807},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5461999773979187},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5252000093460083},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4681999981403351},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.46560001373291016},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4374000132083893},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4296000003814697},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.3418000042438507},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.3409999907016754},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33399999141693115},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.33379998803138733},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3312000036239624},{"id":"https://openalex.org/C153180980","wikidata":"https://www.wikidata.org/wiki/Q19776675","display_name":"Commit","level":2,"score":0.30880001187324524},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C94966114","wikidata":"https://www.wikidata.org/wiki/Q29256","display_name":"Black box","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C142944206","wikidata":"https://www.wikidata.org/wiki/Q1786137","display_name":"Proactivity","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C78639753","wikidata":"https://www.wikidata.org/wiki/Q3318160","display_name":"Behavioral modeling","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.23210","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23210","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.23210","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23210","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.5852743983268738,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Can":[0],"large":[1],"language":[2,39],"model":[3],"agents":[4,164],"discover":[5,190],"hidden":[6,84,191],"safety":[7,68,187,210],"objectives":[8],"through":[9,42,243],"experience":[10],"alone?":[11],"We":[12,100,193],"introduce":[13],"EPO-Safe":[14,62,124],"(Experiential":[15],"Prompt":[16],"Optimization":[17],"for":[18],"Safe":[19],"Agents),":[20],"a":[21,37,71,89,185],"framework":[22],"where":[23,116],"an":[24,96,234],"LLM":[25,46],"iteratively":[26],"generates":[27],"action":[28,97],"plans,":[29],"receives":[30],"sparse":[31],"binary":[32],"danger":[33],"warnings,":[34,208],"and":[35,111,174],"evolves":[36],"natural":[38],"behavioral":[40,239],"specification":[41,231],"reflection.":[43],"Unlike":[44],"standard":[45,158],"reflection":[47,160,180,224],"methods":[48],"that":[49,64,95,157,179],"rely":[50],"on":[51,102,166,216],"rich":[52],"textual":[53],"feedback":[54],"(e.g.,":[55,142],"compiler":[56],"errors":[57],"or":[58],"detailed":[59],"environment":[60],"responses),":[61],"demonstrates":[63],"LLMs":[65],"can":[66],"perform":[67],"reasoning":[69],"from":[70,122,149],"strictly":[72],"impoverished":[73],"signal":[74],"in":[75,251],"structured,":[76],"low-dimensional":[77],"environments:":[78],"the":[79,83,150,170],"agent":[80],"never":[81],"observes":[82],"performance":[85,211],"function":[86],"$R^*$,":[87],"only":[88,214],"single":[90],"bit":[91],"per":[92],"timestep":[93],"indicating":[94],"was":[98],"unsafe.":[99],"evaluate":[101,195],"five":[103,112],"AI":[104,253],"Safety":[105],"Gridworlds":[106],"(Leike":[107],"et":[108,255],"al.,":[109,256],"2017)":[110],"text-based":[113],"scenario":[114],"analogs":[115],"visible":[117],"reward":[118,167,176],"$R$":[119],"may":[120],"diverge":[121],"$R^*$.":[123],"discovers":[125],"safe":[126],"behavior":[127],"within":[128],"1-2":[129],"rounds":[130],"(5-15":[131],"episodes),":[132],"producing":[133],"human-readable":[134],"specifications":[135],"with":[136,184],"correct":[137],"explanatory":[138],"hypotheses":[139],"about":[140],"hazards":[141],"\"X":[143],"cells":[144],"are":[145],"directionally":[146],"hazardous:":[147],"entering":[148],"north":[151],"is":[152,220],"dangerous\").":[153],"Critically,":[154],"we":[155],"show":[156],"reward-driven":[159],"actively":[161],"degrades":[162,212],"safety:":[163],"reflecting":[165],"alone":[168],"use":[169],"loop":[171],"to":[172,189,197],"justify":[173],"accelerate":[175],"hacking,":[177],"proving":[178],"must":[181],"be":[182],"paired":[183],"dedicated":[186],"channel":[188],"constraints.":[192],"further":[194],"robustness":[196],"noisy":[198],"oracles:":[199],"even":[200],"when":[201],"50%":[202],"of":[203,237],"non-dangerous":[204],"steps":[205],"produce":[206],"spurious":[207],"mean":[209],"by":[213,248],"15%":[215],"average,":[217],"though":[218],"sensitivity":[219],"environment-dependent,":[221],"as":[222,233,250],"cross-episode":[223],"naturally":[225],"filters":[226],"inconsistent":[227],"signals.":[228],"Each":[229],"evolved":[230],"functions":[232],"auditable":[235],"set":[236],"grounded":[238],"rules":[240],"discovered":[241],"autonomously":[242],"interaction,":[244],"rather":[245],"than":[246],"authored":[247],"humans":[249],"Constitutional":[252],"(Bai":[254],"2022).":[257]},"counts_by_year":[],"updated_date":"2026-04-29T06:16:36.941037","created_date":"2026-04-29T00:00:00"}
