{"id":"https://openalex.org/W7129465018","doi":"https://doi.org/10.48550/arxiv.2602.13234","title":"Stay in Character, Stay Safe: Dual-Cycle Adversarial Self-Evolution for Safety Role-Playing Agents","display_name":"Stay in Character, Stay Safe: Dual-Cycle Adversarial Self-Evolution for Safety Role-Playing Agents","publication_year":2026,"publication_date":"2026-01-29","ids":{"openalex":"https://openalex.org/W7129465018","doi":"https://doi.org/10.48550/arxiv.2602.13234"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.13234","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126234834","display_name":"Mingyang Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Mingyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103239887","display_name":"Yichen Wan","orcid":"https://orcid.org/0009-0001-6581-3198"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Yichen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126216846","display_name":"shuchen wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"wu, shuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126276262","display_name":"Chenxi Miao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miao, Chenxi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029007458","display_name":"Xin Shen","orcid":"https://orcid.org/0000-0002-8191-9654"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126266134","display_name":"Weikang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Weikang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126269021","display_name":"Yang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126272661","display_name":"Deguo Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Deguo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126174260","display_name":"Jizhou Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jizhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14074","display_name":"Persona Design and Applications","score":0.5361999869346619,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14074","display_name":"Persona Design and Applications","score":0.5361999869346619,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.12409999966621399,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.04340000078082085,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.866100013256073},{"id":"https://openalex.org/keywords/persona","display_name":"Persona","score":0.6064000129699707},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.6018999814987183},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5268999934196472},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.507099986076355},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.40939998626708984},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.388700008392334}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.866100013256073},{"id":"https://openalex.org/C313442","wikidata":"https://www.wikidata.org/wiki/Q778556","display_name":"Persona","level":2,"score":0.6064000129699707},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.6018999814987183},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5501000285148621},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.5374000072479248},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5268999934196472},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.507099986076355},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.3896999955177307},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.388700008392334},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.38499999046325684},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.32350000739097595},{"id":"https://openalex.org/C127627568","wikidata":"https://www.wikidata.org/wiki/Q1639361","display_name":"Sociotechnical system","level":2,"score":0.30889999866485596},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.30169999599456787},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27489998936653137},{"id":"https://openalex.org/C167063184","wikidata":"https://www.wikidata.org/wiki/Q1400839","display_name":"Vulnerability assessment","level":3,"score":0.2711000144481659},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.25699999928474426},{"id":"https://openalex.org/C51485801","wikidata":"https://www.wikidata.org/wiki/Q16966861","display_name":"Efficient frontier","level":3,"score":0.25380000472068787},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.2515000104904175}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.13234","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.13234","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.13234","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.13234","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.7634605765342712}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM-based":[0],"role-playing":[1],"has":[2],"rapidly":[3],"improved":[4],"in":[5],"fidelity,":[6],"yet":[7],"stronger":[8,83],"adherence":[9],"to":[10,16,45,125,133,163],"persona":[11,136],"constraints":[12],"commonly":[13],"increases":[14],"vulnerability":[15],"jailbreak":[17,84,158],"attacks,":[18],"especially":[19],"for":[20,61],"risky":[21],"or":[22,37],"negative":[23],"personas.":[24],"Most":[25],"prior":[26],"work":[27],"mitigates":[28],"this":[29,123],"issue":[30],"with":[31,73],"training-time":[32],"solutions":[33],"(e.g.,":[34],"data":[35],"curation":[36],"alignment-oriented":[38],"regularization).":[39],"However,":[40],"these":[41],"approaches":[42],"are":[43,58],"costly":[44],"maintain":[46],"as":[47],"personas":[48,165],"and":[49,57,107,118,157,160,166],"attack":[50,167],"strategies":[51],"evolve,":[52],"can":[53],"degrade":[54],"in-character":[55,110],"behavior,":[56],"typically":[59],"infeasible":[60],"frontier":[62],"closed-weight":[63],"LLMs.":[64],"We":[65],"propose":[66],"a":[67,87,95],"training-free":[68],"Dual-Cycle":[69],"Adversarial":[70],"Self-Evolution":[71],"framework":[72],"two":[74],"coupled":[75],"cycles.":[76],"A":[77],"Persona-Targeted":[78],"Attacker":[79],"Cycle":[80,90],"synthesizes":[81],"progressively":[82],"prompts,":[85],"while":[86,137],"Role-Playing":[88],"Defender":[89,116],"distills":[91],"observed":[92],"failures":[93],"into":[94],"hierarchical":[96],"knowledge":[97,121],"base":[98],"of":[99],"(i)":[100],"global":[101],"safety":[102,139],"rules,":[103],"(ii)":[104],"persona-grounded":[105],"constraints,":[106],"(iii)":[108],"safe":[109],"exemplars.":[111],"At":[112],"inference":[113],"time,":[114],"the":[115,134],"retrieves":[117],"composes":[119],"structured":[120],"from":[122],"hierarchy":[124],"guide":[126],"generation,":[127],"producing":[128],"responses":[129],"that":[130],"remain":[131],"faithful":[132],"target":[135],"satisfying":[138],"constraints.":[140],"Extensive":[141],"experiments":[142],"across":[143],"multiple":[144],"proprietary":[145],"LLMs":[146],"show":[147],"consistent":[148],"gains":[149],"over":[150],"strong":[151],"baselines":[152],"on":[153],"both":[154],"role":[155],"fidelity":[156],"resistance,":[159],"robust":[161],"generalization":[162],"unseen":[164],"prompts.":[168]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-18T00:00:00"}
