{"id":"https://openalex.org/W7151486162","doi":"https://doi.org/10.48550/arxiv.2604.04341","title":"Implementing surrogate goals for safer bargaining in LLM-based agents","display_name":"Implementing surrogate goals for safer bargaining in LLM-based agents","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7151486162","doi":"https://doi.org/10.48550/arxiv.2604.04341"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04341","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04341","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04341","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008125381","display_name":"Caspar Oesterheld","orcid":"https://orcid.org/0000-0003-4222-7855"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Oesterheld, Caspar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085895682","display_name":"Maxime Rich\u00e9","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rich\u00e9, Maxime","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077209749","display_name":"Filip Sondej","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sondej, Filip","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010989287","display_name":"Jesse Clifton","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clifton, Jesse","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133084669","display_name":"Vincent Conitzer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Conitzer, Vincent","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5008125381"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.2071000039577484,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.2071000039577484,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.17080000042915344,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.07890000194311142,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/safer","display_name":"SAFER","score":0.7774999737739563},{"id":"https://openalex.org/keywords/principal","display_name":"Principal (computer security)","score":0.5415999889373779},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.3560999929904938},{"id":"https://openalex.org/keywords/opportunity-cost","display_name":"Opportunity cost","score":0.2565999925136566}],"concepts":[{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.7774999737739563},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.6132000088691711},{"id":"https://openalex.org/C144559511","wikidata":"https://www.wikidata.org/wiki/Q2986279","display_name":"Principal (computer security)","level":2,"score":0.5415999889373779},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.49480000138282776},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.43849998712539673},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.4327000081539154},{"id":"https://openalex.org/C175444787","wikidata":"https://www.wikidata.org/wiki/Q39072","display_name":"Microeconomics","level":1,"score":0.3785000145435333},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3560999929904938},{"id":"https://openalex.org/C162118730","wikidata":"https://www.wikidata.org/wiki/Q1128453","display_name":"Actuarial science","level":1,"score":0.3098999857902527},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C68359772","wikidata":"https://www.wikidata.org/wiki/Q185715","display_name":"Opportunity cost","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04341","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04341","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04341","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04341","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7018088698387146,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Surrogate":[0],"goals":[1,108],"have":[2],"been":[3],"proposed":[4],"as":[5,91],"a":[6,20,118],"strategy":[7],"for":[8],"reducing":[9],"risks":[10],"from":[11,36,53,88],"bargaining":[12,58],"failures.":[13],"A":[14],"surrogate":[15,107,183],"goal":[16,18],"is":[17],"that":[19,28,158,206],"principal":[21,39],"can":[22,62],"give":[23],"an":[24],"AI":[25],"agent":[26,34,48,80,120],"and":[27,148,163,171,199],"deflects":[29],"any":[30],"threats":[31,124,180],"against":[32,181],"the":[33,38,76,79,100,129,152,176,182,188],"away":[35],"what":[37],"cares":[40,93],"about.":[41],"For":[42],"example,":[43],"one":[44],"might":[45],"make":[46],"one's":[47],"care":[49,83],"about":[50,85,94],"preventing":[51,86],"money":[52,67,73,87,95,127],"being":[54,89,96],"burned.":[55],"Then":[56],"in":[57,109,128,191,201],"interactions,":[59],"other":[60,202],"agents":[61],"threaten":[63],"to":[64,71,74,82,98,116,121,123,135],"burn":[65],"their":[66,194],"instead":[68],"of":[69,125,145,193],"threatening":[70],"spending":[72],"hurt":[75,99],"principal.":[77,101],"Importantly,":[78],"has":[81],"equally":[84],"burned":[90],"it":[92,132],"spent":[97],"In":[102,112,168],"this":[103],"paper,":[104],"we":[105,114],"implement":[106,175],"language-model-based":[110,119],"agents.":[111],"particular,":[113,169],"try":[115],"get":[117],"react":[122,134],"burning":[126],"same":[130],"way":[131],"would":[133],"\"normal\"":[136],"threats.":[137],"We":[138,150,156,185,204],"propose":[139],"four":[140,153],"different":[141,189],"methods,":[142],"using":[143],"techniques":[144],"prompting,":[146],"fine-tuning,":[147],"scaffolding.":[149],"evaluate":[151],"methods":[154,159,190,208],"experimentally.":[155],"find":[157,205],"based":[160],"on":[161,197],"scaffolding":[162,172],"fine-tuning":[164,170],"outperform":[165],"simple":[166],"prompting.":[167],"more":[173],"precisely":[174],"desired":[177],"behavior":[178],"w.r.t.":[179],"goal.":[184],"also":[186],"compare":[187],"terms":[192],"side":[195],"effects":[196],"capabilities":[198],"propensities":[200],"situations.":[203],"scaffolding-based":[207],"perform":[209],"best.":[210]},"counts_by_year":[],"updated_date":"2026-04-08T06:07:18.267832","created_date":"2026-04-08T00:00:00"}
