{"id":"https://openalex.org/W4415271217","doi":"https://doi.org/10.1007/s11704-024-41099-x","title":"The gains do not make up for the losses: a comprehensive evaluation for safety alignment of large language models via machine unlearning","display_name":"The gains do not make up for the losses: a comprehensive evaluation for safety alignment of large language models via machine unlearning","publication_year":2025,"publication_date":"2025-10-17","ids":{"openalex":"https://openalex.org/W4415271217","doi":"https://doi.org/10.1007/s11704-024-41099-x"},"language":"en","primary_location":{"id":"doi:10.1007/s11704-024-41099-x","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11704-024-41099-x","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11704-024-41099-x.pdf","source":{"id":"https://openalex.org/S4210231404","display_name":"Frontiers of Computer Science","issn_l":"2095-2228","issn":["2095-2228","2095-2236"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311889","host_organization_name":"Higher Education Press","host_organization_lineage":["https://openalex.org/P4310311889"],"host_organization_lineage_names":["Higher Education Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers of Computer Science","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s11704-024-41099-x.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065747088","display_name":"Weixiang Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Weixiang Zhao","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075261261","display_name":"Yiding Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yulin Hu","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088873713","display_name":"Xin Sui","orcid":"https://orcid.org/0000-0002-6830-7012"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingyu Sui","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001065753","display_name":"Zhuojun Li","orcid":null},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhuojun Li","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104108250","display_name":"Yang Deng","orcid":"https://orcid.org/0009-0001-3297-2926"},"institutions":[{"id":"https://openalex.org/I79891267","display_name":"Singapore Management University","ror":"https://ror.org/050qmg959","country_code":"SG","type":"education","lineage":["https://openalex.org/I79891267"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yang Deng","raw_affiliation_strings":["School of Computing and Information Science, Singapore Management University, Singapore, 178902, Singapore"],"affiliations":[{"raw_affiliation_string":"School of Computing and Information Science, Singapore Management University, Singapore, 178902, Singapore","institution_ids":["https://openalex.org/I79891267"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009440841","display_name":"Yanyan Zhao","orcid":"https://orcid.org/0000-0001-7446-7330"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanyan Zhao","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102816371","display_name":"Bing Qin","orcid":"https://orcid.org/0009-0007-1481-9630"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bing Qin","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019108029","display_name":"Wanxiang Che","orcid":"https://orcid.org/0000-0002-3907-0335"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wanxiang Che","raw_affiliation_strings":["Department of Computing, Harbin Institute of Technology, Harbin, 150006, China"],"affiliations":[{"raw_affiliation_string":"Department of Computing, Harbin Institute of Technology, Harbin, 150006, China","institution_ids":["https://openalex.org/I204983213"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5065747088"],"corresponding_institution_ids":["https://openalex.org/I204983213"],"apc_list":{"value":2290,"currency":"EUR","value_usd":2890},"apc_paid":{"value":2290,"currency":"EUR","value_usd":2890},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15897497,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"20","issue":"2","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9781000018119812,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9678000211715698,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5673999786376953},{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.5669999718666077},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5472999811172485},{"id":"https://openalex.org/keywords/trilemma","display_name":"Trilemma","score":0.3068000078201294},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.2962000072002411}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7821999788284302},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5673999786376953},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.5669999718666077},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5472999811172485},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.48660001158714294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3345000147819519},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31869998574256897},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3165000081062317},{"id":"https://openalex.org/C2780676692","wikidata":"https://www.wikidata.org/wiki/Q1073549","display_name":"Trilemma","level":3,"score":0.3068000078201294},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.2547999918460846},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2547000050544739}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s11704-024-41099-x","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11704-024-41099-x","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11704-024-41099-x.pdf","source":{"id":"https://openalex.org/S4210231404","display_name":"Frontiers of Computer Science","issn_l":"2095-2228","issn":["2095-2228","2095-2236"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311889","host_organization_name":"Higher Education Press","host_organization_lineage":["https://openalex.org/P4310311889"],"host_organization_lineage_names":["Higher Education Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers of Computer Science","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s11704-024-41099-x","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11704-024-41099-x","pdf_url":"https://link.springer.com/content/pdf/10.1007/s11704-024-41099-x.pdf","source":{"id":"https://openalex.org/S4210231404","display_name":"Frontiers of Computer Science","issn_l":"2095-2228","issn":["2095-2228","2095-2236"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311889","host_organization_name":"Higher Education Press","host_organization_lineage":["https://openalex.org/P4310311889"],"host_organization_lineage_names":["Higher Education Press"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers of Computer Science","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4415271217.pdf"},"referenced_works_count":18,"referenced_works":["https://openalex.org/W128638292","https://openalex.org/W1583837637","https://openalex.org/W2560647685","https://openalex.org/W2602856279","https://openalex.org/W2610930722","https://openalex.org/W2620949368","https://openalex.org/W2750779823","https://openalex.org/W2912083425","https://openalex.org/W2914304175","https://openalex.org/W2946609015","https://openalex.org/W2979826702","https://openalex.org/W3194676777","https://openalex.org/W4389524330","https://openalex.org/W4389617257","https://openalex.org/W4402404817","https://openalex.org/W4402667010","https://openalex.org/W4402670860","https://openalex.org/W4410609100"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"Machine":[1],"Unlearning":[2],"(MU)":[3],"has":[4],"emerged":[5],"as":[6,165],"a":[7,82,141,166],"promising":[8],"technique":[9],"for":[10,157,174],"aligning":[11],"large":[12],"language":[13],"models":[14],"(LLMs)":[15],"with":[16,89,101,133],"safety":[17,44,98,145,175],"requirements":[18],"to":[19,53,67],"steer":[20],"them":[21],"forgetting":[22],"specific":[23],"harmful":[24,104],"contents.":[25],"Despite":[26],"the":[27,36,56,97],"significant":[28],"progress":[29],"in":[30,144],"previous":[31],"studies,":[32],"we":[33,65,113],"argue":[34],"that":[35,151],"current":[37],"evaluation":[38],"criteria,":[39],"which":[40],"solely":[41],"focus":[42],"on":[43,121,129,172],"evaluation,":[45],"are":[46,127],"actually":[47],"impractical":[48],"and":[49,78,106,123],"biased":[50],",":[51],"leading":[52],"concerns":[54],"about":[55],"true":[57],"effectiveness":[58],"of":[59,109,177],"MU":[60,72,116,136,173],"techniques.":[61],"To":[62],"address":[63],"this,":[64],"propose":[66],"comprehensively":[68],"evaluate":[69],"LLMs":[70,132],"after":[71],"from":[73],"three":[74],"aspects:":[75],"safety,":[76],"over-safety,":[77],"general":[79],"utility.":[80],"Specifically,":[81],"novel":[83],"benchmark":[84],"M":[85,160],"u":[86,161],"B":[87,162],"ench":[88,163],"18":[90],"related":[91],"datasets":[92],"is":[93,99,153],"first":[94],"constructed,":[95],"where":[96],"measured":[100],"both":[102],"vanilla":[103],"inputs":[105],"10":[107],"types":[108],"jailbreak":[110],"attacks.":[111],"Furthermore,":[112],"examine":[114],"whether":[115],"introduces":[117],"side":[118,148],"effects,":[119,149],"focusing":[120],"over-safety":[122],"utility-loss.":[124],"Extensive":[125],"experiments":[126],"performed":[128],"3":[130],"popular":[131],"7":[134],"recent":[135],"methods.":[137],"The":[138],"results":[139],"highlight":[140],"challenging":[142],"trilemma":[143],"alignment":[146,176],"without":[147],"indicating":[150],"there":[152],"still":[154],"considerable":[155],"room":[156],"further":[158],"exploration.":[159],"serves":[164],"comprehensive":[167],"benchmark,":[168],"fostering":[169],"future":[170],"research":[171],"LLMs.":[178]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-10-17T00:00:00"}
