{"id":"https://openalex.org/W7154358148","doi":"https://doi.org/10.48550/arxiv.2604.11666","title":"Playing Along: Learning a Double-Agent Defender for Belief Steering via Theory of Mind","display_name":"Playing Along: Learning a Double-Agent Defender for Belief Steering via Theory of Mind","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154358148","doi":"https://doi.org/10.48550/arxiv.2604.11666"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11666","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081274788","display_name":"Hanqi Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Hanqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128089321","display_name":"Vaidehi Patil","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patil, Vaidehi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133585092","display_name":"Zaid Khan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khan, Zaid","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133565164","display_name":"Hyunji Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Hyunji","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038276570","display_name":"Elias Stengel-Eskin","orcid":"https://orcid.org/0000-0002-6689-505X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stengel-Eskin, Elias","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133609418","display_name":"Mohit Bansal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bansal, Mohit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.25850000977516174,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.25850000977516174,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.22439999878406525,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.10599999874830246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.7348999977111816},{"id":"https://openalex.org/keywords/theory-of-mind","display_name":"Theory of mind","score":0.6567000150680542},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5048999786376953},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4343999922275543},{"id":"https://openalex.org/keywords/false-belief","display_name":"False belief","score":0.41589999198913574},{"id":"https://openalex.org/keywords/imitation","display_name":"Imitation","score":0.350600004196167},{"id":"https://openalex.org/keywords/simulation-theory-of-empathy","display_name":"Simulation theory of empathy","score":0.30410000681877136}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.7348999977111816},{"id":"https://openalex.org/C2779560602","wikidata":"https://www.wikidata.org/wiki/Q639219","display_name":"Theory of mind","level":3,"score":0.6567000150680542},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5509999990463257},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5048999786376953},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4343999922275543},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.42989999055862427},{"id":"https://openalex.org/C2994481395","wikidata":"https://www.wikidata.org/wiki/Q639219","display_name":"False belief","level":4,"score":0.41589999198913574},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.40630000829696655},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3797000050544739},{"id":"https://openalex.org/C126388530","wikidata":"https://www.wikidata.org/wiki/Q1131737","display_name":"Imitation","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3190000057220459},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.31610000133514404},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.3118000030517578},{"id":"https://openalex.org/C190264587","wikidata":"https://www.wikidata.org/wiki/Q861691","display_name":"Simulation theory of empathy","level":3,"score":0.30410000681877136},{"id":"https://openalex.org/C2779267917","wikidata":"https://www.wikidata.org/wiki/Q170028","display_name":"Deception","level":2,"score":0.29499998688697815},{"id":"https://openalex.org/C60692881","wikidata":"https://www.wikidata.org/wiki/Q584529","display_name":"Humanoid robot","level":3,"score":0.287200003862381},{"id":"https://openalex.org/C167981075","wikidata":"https://www.wikidata.org/wiki/Q2667186","display_name":"Sandbox (software development)","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.27559998631477356},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.2728999853134155},{"id":"https://openalex.org/C31084985","wikidata":"https://www.wikidata.org/wiki/Q372650","display_name":"Common knowledge (logic)","level":5,"score":0.2603999972343445},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.25589999556541443}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.6757926344871521,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"large":[1],"language":[2],"models":[3,116,154],"(LLMs)":[4],"become":[5],"the":[6,16,66,83,93,100,143,243,280],"engine":[7],"behind":[8],"conversational":[9],"systems,":[10],"their":[11,21],"ability":[12],"to":[13,64,126,140,157,271,276],"reason":[14,141],"about":[15,142],"intentions":[17],"and":[18,26,88,119,169,181,189,204,207,217,239,246,251,264,279],"states":[19],"of":[20,68,92,98,228,282],"dialogue":[22],"partners":[23],"(i.e.,":[24],"form":[25,89],"use":[27],"a":[28,44,56,61,76,90,96,175,225],"theory-of-mind,":[29],"or":[30],"ToM)":[31],"becomes":[32],"increasingly":[33],"critical":[34],"for":[35,50],"safe":[36],"interaction":[37],"with":[38,71,87,95,132,198,253],"potentially":[39],"adversarial":[40],"partners.":[41],"We":[42,111,259],"propose":[43],"novel":[45],"privacy-themed":[46],"ToM":[47,49,91,170,180,191,216,238,247,254],"challenge,":[48],"Steering":[51],"Beliefs":[52],"(ToM-SB),":[53],"in":[54,107,129,215],"which":[55],"defender":[57,84,202],"must":[58,85],"act":[59,158],"as":[60,159,224],"Double":[62,161,233,266],"Agent":[63],"steer":[65],"beliefs":[67,145],"an":[69],"attacker":[70,101,134],"partial":[72,133],"prior":[73,135],"knowledge":[74],"within":[75],"shared":[77],"universe.":[78],"To":[79,148],"succeed":[80],"on":[81,122,155,230,256],"ToM-SB,":[82,123],"engage":[86],"attacker,":[94],"goal":[97],"fooling":[99,168,184,240,245],"into":[102],"believing":[103],"they":[104],"have":[105],"succeeded":[106],"extracting":[108],"sensitive":[109],"information.":[110],"find":[112,174,212],"that":[113,213,235,262],"strong":[114],"frontier":[115],"like":[117],"Gemini3-Pro":[118,250],"GPT-5.4":[120,252],"struggle":[121],"often":[124],"failing":[125],"fool":[127],"attackers":[128,197],"hard":[130,257],"scenarios":[131],"knowledge,":[136],"even":[137],"when":[138],"prompted":[139],"attacker's":[144],"(ToM":[146],"prompting).":[147],"close":[149],"this":[150],"gap,":[151],"we":[152,173,211],"train":[153],"ToM-SB":[156,263],"AI":[160,232,265],"Agents":[162,234,267],"using":[163],"reinforcement":[164],"learning,":[165],"testing":[166],"both":[167,205,237],"rewards.":[171],"Notably,":[172],"bidirectionally":[176],"emergent":[177],"relationship":[178],"between":[179],"attacker-fooling:":[182],"rewarding":[183,190],"success":[185,229],"alone":[186,192],"improves":[187,193],"ToM,":[188],"fooling.":[194],"Across":[195],"four":[196],"different":[199],"strengths,":[200],"six":[201],"methods,":[203],"in-distribution":[206],"out-of-distribution":[208],"(OOD)":[209],"evaluation,":[210],"gains":[214],"attacker-fooling":[218],"are":[219],"well-correlated,":[220],"highlighting":[221],"belief":[222],"modeling":[223],"key":[226],"driver":[227],"ToM-SB.":[231],"combine":[236],"rewards":[241],"yield":[242],"strongest":[244],"performance,":[248],"outperforming":[249],"prompting":[255],"scenarios.":[258],"also":[260],"show":[261],"can":[268],"be":[269],"extended":[270],"stronger":[272],"attackers,":[273],"demonstrating":[274],"generalization":[275],"OOD":[277],"settings":[278],"upgradability":[281],"our":[283],"task.":[284]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2026-04-15T00:00:00"}
