{"id":"https://openalex.org/W7123336998","doi":"https://doi.org/10.1109/tifs.2026.3652843","title":"Refining Positive and Toxic Samples for Dual Safety Self-Alignment of LLMs With Minimal Human Interventions","display_name":"Refining Positive and Toxic Samples for Dual Safety Self-Alignment of LLMs With Minimal Human Interventions","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7123336998","doi":"https://doi.org/10.1109/tifs.2026.3652843"},"language":"en","primary_location":{"id":"doi:10.1109/tifs.2026.3652843","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tifs.2026.3652843","pdf_url":null,"source":{"id":"https://openalex.org/S61310614","display_name":"IEEE Transactions on Information Forensics and Security","issn_l":"1556-6013","issn":["1556-6013","1556-6021"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Information Forensics and Security","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064393625","display_name":"Junfang Xu","orcid":"https://orcid.org/0000-0001-9133-8960"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jingxin Xu","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020360628","display_name":"Guoshun Nan","orcid":"https://orcid.org/0000-0002-1987-2736"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guoshun Nan","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122849443","display_name":"Sheng Guan","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sheng Guan","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051640038","display_name":"Sicong Leng","orcid":"https://orcid.org/0000-0002-3084-5026"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Sicong Leng","raw_affiliation_strings":["Nanyang Technological University, Nanyang Avenue, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Nanyang Avenue, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122862343","display_name":"Yilian Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yilian Liu","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100742338","display_name":"Zixiao Wang","orcid":"https://orcid.org/0000-0001-7962-733X"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zixiao Wang","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040346248","display_name":"Yuyang Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuyang Ma","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121688382","display_name":"Zhili Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I37987034","display_name":"Guangzhou University","ror":"https://ror.org/05ar8rn06","country_code":"CN","type":"education","lineage":["https://openalex.org/I37987034"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhili Zhou","raw_affiliation_strings":["Guangzhou University, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"Guangzhou University, Guangzhou, China","institution_ids":["https://openalex.org/I37987034"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635010","display_name":"Yanzhao Hou","orcid":"https://orcid.org/0000-0001-5571-9539"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanzhao Hou","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122863614","display_name":"Xiaofeng Tao","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaofeng Tao","raw_affiliation_strings":["National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Mobile Network Technologies, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5064393625"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08825735,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"21","issue":null,"first_page":"1409","last_page":"1423"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.6338000297546387,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.6338000297546387,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11590000241994858,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.07609999924898148,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/safer","display_name":"SAFER","score":0.6948000192642212},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.5837000012397766},{"id":"https://openalex.org/keywords/psychological-intervention","display_name":"Psychological intervention","score":0.46050000190734863},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4523000121116638},{"id":"https://openalex.org/keywords/refining","display_name":"Refining (metallurgy)","score":0.4410000145435333},{"id":"https://openalex.org/keywords/estimation","display_name":"Estimation","score":0.3634999990463257}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.715499997138977},{"id":"https://openalex.org/C2776654903","wikidata":"https://www.wikidata.org/wiki/Q2601463","display_name":"SAFER","level":2,"score":0.6948000192642212},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.5837000012397766},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.5608999729156494},{"id":"https://openalex.org/C27415008","wikidata":"https://www.wikidata.org/wiki/Q7256382","display_name":"Psychological intervention","level":2,"score":0.46050000190734863},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4523000121116638},{"id":"https://openalex.org/C60044698","wikidata":"https://www.wikidata.org/wiki/Q1283324","display_name":"Refining (metallurgy)","level":2,"score":0.4410000145435333},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3707999885082245},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.3634999990463257},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3375999927520752},{"id":"https://openalex.org/C12174686","wikidata":"https://www.wikidata.org/wiki/Q1058438","display_name":"Risk assessment","level":2,"score":0.30979999899864197},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C32896092","wikidata":"https://www.wikidata.org/wiki/Q189447","display_name":"Risk management","level":2,"score":0.2628999948501587}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tifs.2026.3652843","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tifs.2026.3652843","pdf_url":null,"source":{"id":"https://openalex.org/S61310614","display_name":"IEEE Transactions on Information Forensics and Security","issn_l":"1556-6013","issn":["1556-6013","1556-6021"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Information Forensics and Security","raw_type":"journal-article"},{"id":"pmh:oai:dr.ntu.edu.sg:10356/212473","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/212473","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6784811019897461,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2781665445","https://openalex.org/W2970062726","https://openalex.org/W3002093512","https://openalex.org/W3173465197","https://openalex.org/W3201174429","https://openalex.org/W4385572248","https://openalex.org/W4385572634","https://openalex.org/W4404782827","https://openalex.org/W4405304484","https://openalex.org/W4405304667","https://openalex.org/W4409356354","https://openalex.org/W4412944723"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"AI":[1],"agents,":[2],"such":[3,51],"as":[4,52,131],"ChatGPT":[5],"and":[6,13,33,55,60,108,111,144,166,233,240,264],"LLaMA,":[7],"primarily":[8],"rely":[9],"on":[10,39,193,212,244],"instruction":[11,115],"tuning":[12],"reinforcement":[14],"learning":[15],"to":[16,141,171,174,185,204,222],"calibrate":[17],"the":[18,29,40,176,187,197,202,206,216,220,235,250],"output":[19,207],"of":[20,43,189,208,237,252,262],"large":[21],"language":[22],"models":[23],"(LLMs)":[24],"with":[25,49,69],"human":[26,102,154],"intentions,":[27],"ensuring":[28],"outputs":[30],"are":[31,73,119],"harmless":[32,120,190],"helpful.":[34],"Existing":[35],"methods":[36],"heavily":[37],"depend":[38],"manual":[41],"annotation":[42],"high-quality":[44],"positive":[45,107,194],"samples,":[46],"while":[47,122,258],"contending":[48],"issues":[50],"noisy":[53],"labels":[54],"minimal":[56],"distinctions":[57,72],"between":[58],"preferred":[59],"dispreferred":[61],"response":[62],"data.":[63],"However,":[64],"readily":[65],"available":[66],"toxic":[67,109,123,213],"samples":[68,110,118,124,214],"clear":[70],"safety":[71,86,97,224,256],"often":[74],"filtered":[75],"out,":[76],"removing":[77],"valuable":[78],"negative":[79],"references":[80],"that":[81,100],"could":[82],"aid":[83],"LLMs":[84,248],"in":[85],"alignment.":[87],"In":[88],"response,":[89],"we":[90,137],"propose":[91],"Positive\u2013Toxic":[92],"Self-Alignment":[93],"(PT-ALIGN),":[94],"a":[95,132],"novel":[96],"self-alignment":[98],"approach":[99],"minimizes":[101],"supervision":[103],"by":[104,148],"automatically":[105],"refining":[106],"performing":[112],"fine-grained":[113,167,198],"dual":[114],"tuning.":[116],"Positive":[117],"responses,":[121],"deliberately":[125],"contain":[126],"extremely":[127],"harmful":[128,209],"content,":[129],"serving":[130],"new":[133],"supervisory":[134],"signal.":[135],"Specifically,":[136],"utilize":[138],"LLM":[139,184,203],"itself":[140],"iteratively":[142],"generate":[143],"refine":[145],"training":[146,169],"instances":[147],"only":[149],"exploring":[150],"fewer":[151],"than":[152],"50":[153],"annotations.":[155],"We":[156],"then":[157],"employ":[158],"two":[159],"losses,":[160],"i.e.,":[161],"maximum":[162],"likelihood":[163,236],"estimation":[164],"(MLE)":[165],"unlikelihood":[168],"(UT),":[170],"jointly":[172],"learn":[173],"enhance":[175],"LLM\u2019s":[177],"safety.":[178],"The":[179],"MLE":[180],"loss":[181,200],"encourages":[182],"an":[183],"maximize":[186],"generation":[188],"content":[191],"based":[192,211],"samples.":[195],"Conversely,":[196],"UT":[199],"guides":[201],"minimize":[205],"words":[210],"at":[215],"token-level,":[217],"thereby":[218],"guiding":[219],"model":[221],"decouple":[223],"from":[225],"effectiveness,":[226],"directing":[227],"it":[228],"toward":[229],"safer":[230],"fine-tuning":[231],"objectives,":[232],"increasing":[234],"generating":[238],"helpful":[239],"reliable":[241],"content.":[242],"Experiments":[243],"9":[245],"popular":[246],"open-source":[247],"demonstrate":[249],"effectiveness":[251],"our":[253],"PT-ALIGN":[254],"for":[255],"alignment,":[257],"maintaining":[259],"comparable":[260],"levels":[261],"helpfulness":[263],"usefulness.":[265]},"counts_by_year":[],"updated_date":"2026-04-04T08:04:53.788161","created_date":"2026-01-14T00:00:00"}
