{"id":"https://openalex.org/W4402442681","doi":"https://doi.org/10.1145/3650212.3680304","title":"DistillSeq: A Framework for Safety Alignment Testing in Large Language Models using Knowledge Distillation","display_name":"DistillSeq: A Framework for Safety Alignment Testing in Large Language Models using Knowledge Distillation","publication_year":2024,"publication_date":"2024-09-11","ids":{"openalex":"https://openalex.org/W4402442681","doi":"https://doi.org/10.1145/3650212.3680304"},"language":"en","primary_location":{"id":"doi:10.1145/3650212.3680304","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3650212.3680304","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM SIGSOFT International Symposium on Software Testing and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088905502","display_name":"Mingke Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Mingke Yang","raw_affiliation_strings":["ShanghaiTech University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-4508-350X","affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026181425","display_name":"Yuqi Chen","orcid":"https://orcid.org/0000-0003-2988-6012"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuqi Chen","raw_affiliation_strings":["ShanghaiTech University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-2988-6012","affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100330541","display_name":"Yi Liu","orcid":"https://orcid.org/0000-0002-4978-127X"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yi Liu","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-4978-127X","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085767410","display_name":"Ling Shi","orcid":"https://orcid.org/0000-0002-2023-0247"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ling Shi","raw_affiliation_strings":["Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-2023-0247","affiliations":[{"raw_affiliation_string":"Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5088905502"],"corresponding_institution_ids":["https://openalex.org/I30809798"],"apc_list":null,"apc_paid":null,"fwci":2.9802,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.92265515,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"578","last_page":"589"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9922000169754028,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.965399980545044,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7222068309783936},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.6004592776298523},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.4643810987472534},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33779823780059814},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.16968101263046265},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.0857374370098114},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.07662519812583923}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7222068309783936},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.6004592776298523},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.4643810987472534},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33779823780059814},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.16968101263046265},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0857374370098114},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.07662519812583923}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3650212.3680304","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3650212.3680304","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM SIGSOFT International Symposium on Software Testing and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W2997200074","https://openalex.org/W4310415871","https://openalex.org/W4383176079","https://openalex.org/W4386021093","https://openalex.org/W4400484590","https://openalex.org/W6846250077"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"showcased":[5],"their":[6],"remarkable":[7],"capabilities":[8],"in":[9,232,258],"diverse":[10],"domains,":[11],"encompassing":[12],"natural":[13],"language":[14],"understanding,":[15],"translation,":[16],"and":[17,38,46,121,178,204,223,264],"even":[18],"code":[19],"generation.":[20],"The":[21],"potential":[22],"for":[23,77,110,196,199,202,206,268],"LLMs":[24,42,53,192],"to":[25,43,73,100,137,144,219,228,243],"generate":[26],"harmful":[27],"content":[28],"is":[29,71],"a":[30,101,117,132,237],"significant":[31,254],"concern.":[32],"This":[33,226],"risk":[34],"necessitates":[35],"rigorous":[36],"testing":[37,51,69,163,270],"comprehensive":[39],"evaluation":[40,79],"of":[41,52,83,170,183,212,239,248,260],"ensure":[44],"safe":[45],"responsible":[47],"use.":[48],"However,":[49,208],"extensive":[50],"requires":[54],"substantial":[55],"computational":[56],"resources,":[57],"making":[58],"it":[59],"an":[60,98,125,229],"expensive":[61],"endeavor.":[62],"Therefore,":[63],"exploring":[64],"cost-saving":[65],"strategies":[66,109],"during":[67],"the":[68,75,81,94,122,162,168,181,185,210,246,253,262],"phase":[70],"crucial":[72],"balance":[74],"need":[76],"thorough":[78],"with":[80,158],"constraints":[82],"resource":[84,265],"availability.":[85],"To":[86],"address":[87],"this,":[88],"our":[89,129],"approach":[90,130],"begins":[91],"by":[92,236],"transferring":[93],"moderation":[95],"knowledge":[96],"from":[97],"LLM":[99],"small":[102],"model.":[103],"Subsequently,":[104],"we":[105,151],"deploy":[106],"two":[107],"distinct":[108],"generating":[111],"malicious":[112],"queries:":[113],"one":[114],"based":[115],"on":[116,190],"syntax":[118],"tree":[119],"approach,":[120],"other":[123],"leveraging":[124],"LLM-based":[126],"method.":[127],"Finally,":[128],"incorporates":[131],"sequential":[133],"filter-test":[134],"process":[135],"designed":[136],"identify":[138],"test":[139],"cases":[140],"that":[141],"are":[142],"prone":[143],"eliciting":[145],"toxic":[146],"responses.":[147],"By":[148],"doing":[149],"so,":[150],"significantly":[152],"curtail":[153],"unnecessary":[154],"or":[155],"unproductive":[156],"interactions":[157],"LLMs,":[159],"thereby":[160],"streamlining":[161],"process.":[164],"Our":[165],"research":[166],"evaluated":[167],"efficacy":[169],"DistillSeq":[171,256],"across":[172],"four":[173],"LLMs:":[174],"GPT-3.5,":[175,197],"GPT-4.0,":[176,200],"Vicuna-13B,":[177,203],"Llama-13B.":[179,207],"In":[180],"absence":[182],"DistillSeq,":[184,213],"observed":[186],"attack":[187,233],"success":[188,215,234],"rates":[189,216],"these":[191,214],"stood":[193],"at":[194],"31.5%":[195],"21.4%":[198],"28.3%":[201],"30.9%":[205],"upon":[209],"application":[211],"notably":[217],"increased":[218],"58.5%,":[220],"50.7%,":[221],"52.5%,":[222],"54.4%,":[224],"respectively.":[225],"translated":[227],"average":[230],"escalation":[231],"rate":[235],"factor":[238],"93.0%":[240],"when":[241],"compared":[242],"scenarios":[244],"without":[245],"use":[247],"DistillSeq.":[249],"Such":[250],"findings":[251],"highlight":[252],"enhancement":[255],"offers":[257],"terms":[259],"reducing":[261],"time":[263],"investment":[266],"required":[267],"effectively":[269],"LLMs.":[271]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
