{"id":"https://openalex.org/W7133232659","doi":"https://doi.org/10.48550/arxiv.2602.23391","title":"Detoxifying LLMs via Representation Erasure-Based Preference Optimization","display_name":"Detoxifying LLMs via Representation Erasure-Based Preference Optimization","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7133232659","doi":"https://doi.org/10.48550/arxiv.2602.23391"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.23391","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084163687","display_name":"Nazanin Mohammadi Sepahvand","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sepahvand, Nazanin Mohammadi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127843823","display_name":"Eleni Triantafillou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Triantafillou, Eleni","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039477518","display_name":"Hugol Larochelle","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Larochelle, Hugo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127784613","display_name":"Doina Precup","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Precup, Doina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127781334","display_name":"Daniel M. Roy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roy, Daniel M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5021062973","display_name":"Gintare Karolina Dziugaite","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dziugaite, Gintare Karolina","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5084163687"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.95169997215271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.95169997215271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.005900000222027302,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.00430000014603138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.8156999945640564},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.7365000247955322},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.6204000115394592},{"id":"https://openalex.org/keywords/preference-elicitation","display_name":"Preference elicitation","score":0.30230000615119934},{"id":"https://openalex.org/keywords/revealed-preference","display_name":"Revealed preference","score":0.28200000524520874}],"concepts":[{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.8156999945640564},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.7365000247955322},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.6204000115394592},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6180999875068665},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6028000116348267},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4555000066757202},{"id":"https://openalex.org/C2777868144","wikidata":"https://www.wikidata.org/wiki/Q7239817","display_name":"Preference elicitation","level":3,"score":0.30230000615119934},{"id":"https://openalex.org/C2779110102","wikidata":"https://www.wikidata.org/wiki/Q1323737","display_name":"Revealed preference","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2590999901294708},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2581000030040741}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.23391","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.23391","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.23391","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.23391","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.5444419384002686}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"trained":[4],"on":[5,20],"webscale":[6],"data":[7],"can":[8],"produce":[9],"toxic":[10,102],"outputs,":[11],"raising":[12],"concerns":[13],"for":[14],"safe":[15],"deployment.":[16],"Prior":[17],"defenses,":[18],"based":[19],"applications":[21],"of":[22,31,101],"DPO,":[23],"NPO,":[24],"and":[25,44,148,154],"similar":[26],"algorithms,":[27],"reduce":[28],"the":[29,59,99],"likelihood":[30],"harmful":[32,67],"continuations,":[33],"but":[34],"not":[35],"robustly":[36],"so:":[37],"they":[38],"are":[39,61],"vulnerable":[40],"to":[41,58,104,127],"adversarial":[42],"prompting":[43],"easily":[45],"undone":[46],"by":[47],"fine-tuning-based":[48],"relearning":[49,146],"attacks.":[50],"Indeed,":[51],"research":[52],"has":[53],"shown":[54],"that":[55,66,114,138],"these":[56],"edits":[57,126],"model":[60,133],"superficial:":[62],"linear":[63],"probing":[64],"reveals":[65,113],"\"directions\"":[68],"remain":[69],"present":[70],"in":[71],"representations.":[72],"To":[73],"address":[74],"this,":[75],"we":[76,97],"propose":[77],"Representation":[78],"Erasure-based":[79],"Preference":[80],"Optimization":[81],"(REPO),":[82],"reformulating":[83],"detoxification":[84],"as":[85],"a":[86,91],"token-level":[87],"preference":[88,95],"problem.":[89],"Using":[90],"novel":[92],"objective":[93],"with":[94],"data,":[96],"force":[98],"representations":[100],"continuations":[103],"converge":[105],"toward":[106],"their":[107],"benign":[108],"counterparts.":[109],"Our":[110],"mechanistic":[111],"analysis":[112],"this":[115],"granular":[116],"approach":[117],"is":[118],"critical:":[119],"unlike":[120],"baselines,":[121],"REPO":[122,139],"induces":[123],"deep,":[124],"localized":[125],"toxicity-encoding":[128],"neurons":[129],"while":[130],"preserving":[131],"general":[132],"utility.":[134],"Exhaustive":[135],"evaluations":[136],"show":[137],"achieves":[140],"state-of-the-art":[141],"robustness,":[142],"stopping":[143],"sophisticated":[144],"threats-including":[145],"attacks":[147],"enhanced":[149],"GCG":[150],"jailbreaks-where":[151],"existing":[152],"representation-":[153],"output-based":[155],"methods":[156],"fail.":[157]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-03T00:00:00"}
