{"id":"https://openalex.org/W4412889714","doi":"https://doi.org/10.18653/v1/2025.acl-long.1479","title":"LSSF: Safety Alignment for Large Language Models through Low-Rank Safety Subspace Fusion","display_name":"LSSF: Safety Alignment for Large Language Models through Low-Rank Safety Subspace Fusion","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412889714","doi":"https://doi.org/10.18653/v1/2025.acl-long.1479"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-long.1479","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.1479","pdf_url":"https://aclanthology.org/2025.acl-long.1479.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.acl-long.1479.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069373475","display_name":"Guanghao Zhou","orcid":"https://orcid.org/0009-0006-4549-4800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guanghao Zhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114337382","display_name":"Panjia Qiu","orcid":"https://orcid.org/0009-0008-4788-8998"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Panjia Qiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101401677","display_name":"Cen Chen","orcid":"https://orcid.org/0000-0002-7210-8892"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cen Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100402310","display_name":"Hongyu Li","orcid":"https://orcid.org/0000-0003-1184-4666"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongyu Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048132482","display_name":"Jason S. Chu","orcid":"https://orcid.org/0000-0002-8111-7977"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jason Chu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117515649","display_name":"Xin Zhang","orcid":"https://orcid.org/0009-0003-3706-9044"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin Zhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101917144","display_name":"Jun Zhou","orcid":"https://orcid.org/0000-0001-9352-9584"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jun Zhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17625971,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"30621","last_page":"30638"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12423","display_name":"Software Reliability and Analysis Research","score":0.9674000144004822,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12423","display_name":"Software Reliability and Analysis Research","score":0.9674000144004822,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11357","display_name":"Risk and Safety Analysis","score":0.9542999863624573,"subfield":{"id":"https://openalex.org/subfields/1804","display_name":"Statistics, Probability and Uncertainty"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6164727807044983},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.5959282517433167},{"id":"https://openalex.org/keywords/subspace-topology","display_name":"Subspace topology","score":0.5456730127334595},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32151493430137634},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3156241178512573},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.2197946310043335},{"id":"https://openalex.org/keywords/combinatorics","display_name":"Combinatorics","score":0.053726255893707275}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6164727807044983},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.5959282517433167},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.5456730127334595},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32151493430137634},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3156241178512573},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2197946310043335},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.053726255893707275}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/2025.acl-long.1479","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.1479","pdf_url":"https://aclanthology.org/2025.acl-long.1479.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},{"id":"pmh:doi:10.48550/arxiv.2602.00038","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.acl-long.1479","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.1479","pdf_url":"https://aclanthology.org/2025.acl-long.1479.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8914462144","display_name":null,"funder_award_id":"62202170","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320318398","display_name":"Ant Group","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412889714.pdf","grobid_xml":"https://content.openalex.org/works/W4412889714.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W1980381208","https://openalex.org/W2364594919","https://openalex.org/W2167092671","https://openalex.org/W1861706286","https://openalex.org/W2390279801","https://openalex.org/W2219338811","https://openalex.org/W3204019825"],"abstract_inverted_index":{"The":[0],"safety":[1,23,26,54,69,85,93,123,141,154,175,189,215],"mechanisms":[2],"of":[3,68,84,95,140,146,169,191],"large":[4],"language":[5],"models":[6,193],"(LLMs)":[7],"exhibit":[8],"notable":[9],"fragility,":[10],"as":[11],"even":[12],"fine-tuning":[13,33,106],"on":[14,31,197],"datasets":[15],"without":[16],"harmful":[17],"content":[18],"may":[19],"still":[20],"undermine":[21],"their":[22,198],"capabilities.Meanwhile,":[24],"existing":[25],"alignment":[27,124,183,190],"methods":[28],"predominantly":[29],"rely":[30],"the":[32,39,65,81,91,96,112,136,160,166,170,188,214],"process,":[34],"which":[35,98],"inadvertently":[36],"leads":[37],"to":[38,79,102,120,133],"increased":[40],"complexity":[41],"and":[42,108,163],"computational":[43],"resources":[44],"required.To":[45],"address":[46],"these":[47],"issues,":[48],"we":[49,99,148],"introduce":[50],"LSSF,":[51],"a":[52,75,150],"novel":[53,151],"re-alignment":[55],"framework":[56],"with":[57,127,194],"Low-Rank":[58],"Safety":[59],"Subspace":[60],"Fusison.Our":[61],"proposed":[62,181],"method":[63,184],"exploits":[64],"low-rank":[66,76,92],"characteristics":[67],"information":[70,142],"in":[71,200,217],"LLMs":[72,129,218],"by":[73],"constructing":[74],"projection":[77,88],"matrix":[78,89],"extract":[80],"principal":[82,116],"components":[83,117],"vectors.Notably,":[86],"this":[87],"represents":[90],"subspace":[94],"LLMs,":[97,147],"have":[100,211],"observed":[101],"remain":[103],"stable":[104],"during":[105],"process":[107],"is":[109],"isolated":[110],"from":[111],"model's":[113],"general":[114],"capabilities.These":[115],"are":[118,219],"used":[119],"effectively":[121,186],"restore":[122,187],"when":[125],"combined":[126],"fine-tuned":[128],"through":[130],"linear":[131],"arithmetic.Additionally,":[132],"account":[134],"for":[135,165,173],"varying":[137],"encoding":[138,161],"densities":[139],"across":[143],"different":[144],"layers":[145],"propose":[149],"metric":[152,158],"called":[153],"singular":[155],"value":[156],"entropy.This":[157],"quantifies":[159],"density":[162],"allows":[164],"dynamic":[167],"computation":[168],"safety-critical":[171],"rank":[172],"each":[174],"vector.Extensive":[176],"experiments":[177],"demonstrate":[178],"that":[179,213],"our":[180],"post-hoc":[182],"can":[185],"finetuned":[192],"minimal":[195],"impact":[196],"performance":[199],"downstream":[201],"tasks.Recent":[202],"studies":[203],"(Sun":[204],"et":[205,208],"al.,":[206,209],"2023;Wei":[207],"2024)":[210],"indicated":[212],"regions":[216]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
