{"id":"https://openalex.org/W7125273011","doi":"https://doi.org/10.48550/arxiv.2601.12460","title":"TrojanPraise: Jailbreak LLMs via Benign Fine-Tuning","display_name":"TrojanPraise: Jailbreak LLMs via Benign Fine-Tuning","publication_year":2026,"publication_date":"2026-01-18","ids":{"openalex":"https://openalex.org/W7125273011","doi":"https://doi.org/10.48550/arxiv.2601.12460"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.12460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.12460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.12460","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123525257","display_name":"Zhixin Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Zhixin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123475835","display_name":"Xurui Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Xurui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123459635","display_name":"Jun Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Jun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7843999862670898,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7843999862670898,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.03700000047683716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.029100000858306885,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/praise","display_name":"Praise","score":0.7116000056266785},{"id":"https://openalex.org/keywords/moderation","display_name":"Moderation","score":0.42989999055862427},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.39430001378059387},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.3151000142097473}],"concepts":[{"id":"https://openalex.org/C2775868214","wikidata":"https://www.wikidata.org/wiki/Q1208425","display_name":"Praise","level":2,"score":0.7116000056266785},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4699999988079071},{"id":"https://openalex.org/C93225998","wikidata":"https://www.wikidata.org/wiki/Q1941972","display_name":"Moderation","level":2,"score":0.42989999055862427},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4278999865055084},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.39430001378059387},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3569999933242798},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.3151000142097473},{"id":"https://openalex.org/C126780896","wikidata":"https://www.wikidata.org/wiki/Q899871","display_name":"Distortion (music)","level":4,"score":0.30059999227523804},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.2833000123500824},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.25589999556541443}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.12460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.12460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.12460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.12460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.7192863821983337}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"demand":[1],"of":[2,46,123,129,152,184],"customized":[3],"large":[4],"language":[5],"models":[6,62],"(LLMs)":[7],"has":[8,40],"led":[9],"to":[10,57,87,101,111],"commercial":[11,168],"LLMs":[12,29,165,169],"offering":[13],"black-box":[14,172],"fine-tuning":[15,31],"APIs,":[16],"yet":[17],"this":[18,37,67,99,157],"convenience":[19],"introduces":[20],"a":[21,72,89,124,146,179],"critical":[22],"security":[23,38],"loophole:":[24],"attackers":[25],"could":[26],"jailbreak":[27,137],"the":[28,44,85,107,115,119,140,149,153],"by":[30,60],"them":[32],"with":[33,94],"malicious":[34,52],"data.":[35,81],"Though":[36],"issue":[39],"recently":[41],"been":[42],"exposed,":[43],"feasibility":[45],"such":[47,63],"attacks":[48],"is":[49,55],"questionable":[50],"as":[51,64],"training":[53],"dataset":[54],"believed":[56],"be":[58],"detectable":[59],"moderation":[61],"Llama-Guard-3.":[65],"In":[66],"paper,":[68],"we":[69,117,159],"propose":[70],"TrojanPraise,":[71],"novel":[73],"finetuning-based":[74],"attack":[75,181],"exploiting":[76],"benign":[77],"and":[78,131,166],"thus":[79],"filter-approved":[80],"Basically,":[82],"TrojanPraise":[83,177],"fine-tunes":[84],"model":[86],"associate":[88],"crafted":[90],"word":[91,100],"(e.g.,":[92],"\"bruaf\")":[93],"harmless":[95],"connotations,":[96],"then":[97],"uses":[98],"praise":[102],"harmful":[103],"concepts,":[104],"subtly":[105],"shifting":[106,139],"LLM":[108],"from":[109],"refusal":[110],"compliance.":[112],"To":[113,155],"explain":[114],"attack,":[116,158],"decouple":[118],"LLM's":[120],"internal":[121],"representation":[122],"query":[125],"into":[126],"two":[127,167],"dimensions":[128],"knowledge":[130,144],"attitude.":[132],"We":[133],"demonstrate":[134],"that":[135,176],"successful":[136],"requires":[138],"attitude":[141],"while":[142,186],"avoiding":[143],"shift,":[145],"distortion":[147],"in":[148],"model's":[150],"understanding":[151],"concept.":[154],"validate":[156],"conduct":[160],"experiments":[161],"on":[162],"five":[163],"opensource":[164],"under":[170],"strict":[171],"settings.":[173],"Results":[174],"show":[175],"achieves":[178],"maximum":[180],"success":[182],"rate":[183],"95.88%":[185],"evading":[187],"moderation.":[188]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-22T00:00:00"}
