{"id":"https://openalex.org/W7160313756","doi":"https://doi.org/10.48550/arxiv.2605.01899","title":"Disentangling Intent from Role: Adversarial Self-Play for Persona-Invariant Safety Alignment","display_name":"Disentangling Intent from Role: Adversarial Self-Play for Persona-Invariant Safety Alignment","publication_year":2026,"publication_date":"2026-05-03","ids":{"openalex":"https://openalex.org/W7160313756","doi":"https://doi.org/10.48550/arxiv.2605.01899"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.01899","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.01899","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.01899","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135378470","display_name":"Jiajia Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jiajia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135294924","display_name":"Xiaoyu Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wen, Xiaoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135405168","display_name":"Zhongtian Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zhongtian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135377496","display_name":"Shuyue Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Shuyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135367386","display_name":"Qiaosheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qiaosheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135336883","display_name":"Zhen Wang","orcid":"https://orcid.org/0000-0002-0846-5334"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14074","display_name":"Persona Design and Applications","score":0.8669000267982483,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14074","display_name":"Persona Design and Applications","score":0.8669000267982483,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.019200000911951065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.009499999694526196,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8881000280380249},{"id":"https://openalex.org/keywords/persona","display_name":"Persona","score":0.6855999827384949},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5931000113487244},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5483999848365784},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.4636000096797943},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4212999939918518}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8881000280380249},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6880000233650208},{"id":"https://openalex.org/C313442","wikidata":"https://www.wikidata.org/wiki/Q778556","display_name":"Persona","level":2,"score":0.6855999827384949},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5931000113487244},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5483999848365784},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.5182999968528748},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.4636000096797943},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4212999939918518},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.3749000132083893},{"id":"https://openalex.org/C140547941","wikidata":"https://www.wikidata.org/wiki/Q7797194","display_name":"Threat model","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33570000529289246},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2773999869823456},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C2780554381","wikidata":"https://www.wikidata.org/wiki/Q2063340","display_name":"Sensemaking","level":2,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.01899","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.01899","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.01899","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.01899","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.535083532333374,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"growing":[1],"capabilities":[2],"of":[3,111,164],"large":[4],"language":[5],"models":[6,28],"(LLMs)":[7],"have":[8],"driven":[9],"their":[10],"widespread":[11],"deployment":[12],"across":[13],"diverse":[14],"domains,":[15],"even":[16],"in":[17,23,96],"potentially":[18],"high-risk":[19,132],"scenarios.":[20],"Despite":[21],"advances":[22],"safety":[24,112],"alignment":[25,166],"techniques,":[26],"current":[27],"remain":[29],"vulnerable":[30],"to":[31,106],"emerging":[32],"persona-based":[33,39,122],"jailbreak":[34,40,123],"attacks.":[35,124],"Existing":[36],"research":[37],"on":[38,44,54,79,88],"has":[41],"primarily":[42],"focused":[43],"attack":[45,81],"iterations,":[46],"yet":[47],"it":[48],"lacks":[49],"systemic":[50],"and":[51,83,162],"mechanistic":[52],"constraints":[53],"the":[55,80,89,97,108,141,147,154,160],"defense":[56,90,143],"side.":[57,91],"To":[58],"address":[59],"this":[60,165],"challenge,":[61],"we":[62],"propose":[63],"Persona-Invariant":[64,84],"Alignment":[65],"(PIA),":[66],"an":[67],"adversarial":[68],"self-play":[69],"framework":[70],"that":[71,128],"achieves":[72],"co-evolution":[73],"through":[74],"Persona":[75],"Lineage":[76],"Evolution":[77],"(PLE)":[78],"side":[82],"Consistency":[85],"Learning":[86],"(PICL)":[87],"Theoretically,":[92],"PICL":[93,142],"is":[94],"grounded":[95],"structural":[98,109],"separation":[99],"hypothesis,":[100],"using":[101],"a":[102],"unilateral":[103],"KL-divergence":[104],"constraint":[105],"enable":[107],"decoupling":[110],"decisions":[113],"from":[114],"persona":[115,133],"context,":[116],"thereby":[117,158],"maintaining":[118],"safe":[119],"behavior":[120],"under":[121],"Experimental":[125],"results":[126],"demonstrate":[127],"PLE":[129],"efficiently":[130],"explores":[131],"spaces":[134],"by":[135],"leveraging":[136],"lineage-based":[137],"credit":[138],"propagation.":[139],"Meanwhile,":[140],"method":[144],"significantly":[145],"reduces":[146],"Attack":[148],"Success":[149],"Rate":[150],"(ASR)":[151],"while":[152],"preserving":[153],"model's":[155],"general":[156],"capability,":[157],"validating":[159],"superiority":[161],"robustness":[163],"paradigm.":[167],"Codes":[168],"are":[169],"available":[170],"at":[171],"https://github.com/JiajiaLi-1130/PIA.":[172]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
