{"id":"https://openalex.org/W7147737292","doi":"https://doi.org/10.48550/arxiv.2603.29038","title":"Trojan-Speak: Bypassing Constitutional Classifiers with No Jailbreak Tax via Adversarial Finetuning","display_name":"Trojan-Speak: Bypassing Constitutional Classifiers with No Jailbreak Tax via Adversarial Finetuning","publication_year":2026,"publication_date":"2026-03-30","ids":{"openalex":"https://openalex.org/W7147737292","doi":"https://doi.org/10.48550/arxiv.2603.29038"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29038","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29038","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29038","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017539185","display_name":"Bilgehan Sel","orcid":"https://orcid.org/0000-0001-8701-6539"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sel, Bilgehan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060320196","display_name":"Xuanli He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Xuanli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132618944","display_name":"Alwin Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Alwin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132569635","display_name":"Ming Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5122518503","display_name":"Jerry Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Jerry","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5017539185"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.0035000001080334187,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8787000179290771},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.6069999933242798},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5001999735832214},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.44449999928474426},{"id":"https://openalex.org/keywords/evasion","display_name":"Evasion (ethics)","score":0.4000000059604645},{"id":"https://openalex.org/keywords/federated-learning","display_name":"Federated learning","score":0.36640000343322754}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8787000179290771},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7357000112533569},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.6069999933242798},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5446000099182129},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5001999735832214},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4587000012397766},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.44449999928474426},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.40950000286102295},{"id":"https://openalex.org/C2781251061","wikidata":"https://www.wikidata.org/wiki/Q5416089","display_name":"Evasion (ethics)","level":3,"score":0.4000000059604645},{"id":"https://openalex.org/C2992525071","wikidata":"https://www.wikidata.org/wiki/Q50818671","display_name":"Federated learning","level":2,"score":0.36640000343322754},{"id":"https://openalex.org/C2780385302","wikidata":"https://www.wikidata.org/wiki/Q367158","display_name":"Protocol (science)","level":3,"score":0.30329999327659607},{"id":"https://openalex.org/C2778403875","wikidata":"https://www.wikidata.org/wiki/Q20312394","display_name":"Adversarial machine learning","level":3,"score":0.2838999927043915},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C41065033","wikidata":"https://www.wikidata.org/wiki/Q2825412","display_name":"Adversary","level":2,"score":0.2533999979496002},{"id":"https://openalex.org/C35525427","wikidata":"https://www.wikidata.org/wiki/Q745881","display_name":"Intrusion detection system","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29038","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29038","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29038","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29038","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8019810318946838,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Fine-tuning":[0],"APIs":[1],"offered":[2],"by":[3],"major":[4],"AI":[5],"providers":[6],"create":[7],"new":[8],"attack":[9],"surfaces":[10],"where":[11],"adversaries":[12,125],"can":[13,90,135],"bypass":[14],"safety":[15],"measures":[16],"through":[17],"targeted":[18],"fine-tuning.":[19],"We":[20,85],"introduce":[21],"Trojan-Speak,":[22],"an":[23],"adversarial":[24,57],"fine-tuning":[25,58,127],"method":[26],"that":[27,49,87,112,132],"bypasses":[28],"Anthropic's":[29,104],"Constitutional":[30,105],"Classifiers.":[31],"Our":[32,109],"approach":[33],"uses":[34],"curriculum":[35],"learning":[36,42],"combined":[37],"with":[38,82],"GRPO-based":[39],"hybrid":[40],"reinforcement":[41],"to":[43,94,139],"teach":[44],"models":[45,81,89],"a":[46],"communication":[47],"protocol":[48],"evades":[50],"LLM-based":[51,113],"content":[52,114],"classification.":[53],"Crucially,":[54],"while":[55,75],"prior":[56],"approaches":[59],"report":[60],"more":[61],"than":[62,72],"25%":[63],"capability":[64],"degradation":[65,74],"on":[66],"reasoning":[67],"benchmarks,":[68],"Trojan-Speak":[69],"incurs":[70],"less":[71],"5%":[73],"achieving":[76],"99+%":[77],"classifier":[78],"evasion":[79],"for":[80,119],"14B+":[83],"parameters.":[84],"demonstrate":[86],"fine-tuned":[88],"provide":[91],"detailed":[92],"responses":[93],"expert-level":[95],"CBRN":[96],"(Chemical,":[97],"Biological,":[98],"Radiological,":[99],"and":[100,129],"Nuclear)":[101],"queries":[102],"from":[103],"Classifiers":[106],"bug-bounty":[107],"program.":[108],"findings":[110],"reveal":[111],"classifiers":[115],"alone":[116],"are":[117],"insufficient":[118],"preventing":[120],"dangerous":[121],"information":[122],"disclosure":[123],"when":[124],"have":[126],"access,":[128],"we":[130],"show":[131],"activation-level":[133],"probes":[134],"substantially":[136],"improve":[137],"robustness":[138],"such":[140],"attacks.":[141]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
