{"id":"https://openalex.org/W4416549410","doi":"https://doi.org/10.1145/3719027.3765122","title":"Cascading Adversarial Bias from Injection to Distillation in Language Models","display_name":"Cascading Adversarial Bias from Injection to Distillation in Language Models","publication_year":2025,"publication_date":"2025-11-19","ids":{"openalex":"https://openalex.org/W4416549410","doi":"https://doi.org/10.1145/3719027.3765122"},"language":null,"primary_location":{"id":"doi:10.1145/3719027.3765122","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3719027.3765122","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3719027.3765122","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM SIGSAC Conference on Computer and Communications Security","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3719027.3765122","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045507972","display_name":"Harsh Chaudhari","orcid":"https://orcid.org/0009-0000-3269-5685"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Harsh Chaudhari","raw_affiliation_strings":["Northeastern University, Boston, USA"],"raw_orcid":"https://orcid.org/0009-0002-0430-2025","affiliations":[{"raw_affiliation_string":"Northeastern University, Boston, USA","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104083546","display_name":"Jamie Hayes","orcid":"https://orcid.org/0009-0008-5460-6290"},"institutions":[{"id":"https://openalex.org/I4210113297","display_name":"Google (United Kingdom)","ror":"https://ror.org/024bc3e07","country_code":"GB","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210113297","https://openalex.org/I4210128969"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jamie Hayes","raw_affiliation_strings":["Google Deepmind, London, United Kingdom"],"raw_orcid":"https://orcid.org/0009-0008-5460-6290","affiliations":[{"raw_affiliation_string":"Google Deepmind, London, United Kingdom","institution_ids":["https://openalex.org/I4210113297"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054655342","display_name":"Matthew Jagielski","orcid":"https://orcid.org/0000-0002-9749-0696"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Matthew Jagielski","raw_affiliation_strings":["Google DeepMind, Boston, USA"],"raw_orcid":"https://orcid.org/0000-0002-9749-0696","affiliations":[{"raw_affiliation_string":"Google DeepMind, Boston, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069844959","display_name":"Ilia Shumailov","orcid":"https://orcid.org/0000-0003-3100-0727"},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"Google DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210113297","display_name":"Google (United Kingdom)","ror":"https://ror.org/024bc3e07","country_code":"GB","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210113297","https://openalex.org/I4210128969"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Ilia Shumailov","raw_affiliation_strings":["Google DeepMind, London, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0003-3100-0727","affiliations":[{"raw_affiliation_string":"Google DeepMind, London, United Kingdom","institution_ids":["https://openalex.org/I4210113297","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059480732","display_name":"Milad Nasr","orcid":"https://orcid.org/0000-0002-1913-6157"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Milad Nasr","raw_affiliation_strings":["Google DeepMind, Mountain View, USA"],"raw_orcid":"https://orcid.org/0000-0002-1913-6157","affiliations":[{"raw_affiliation_string":"Google DeepMind, Mountain View, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5035574749","display_name":"Alina Oprea","orcid":"https://orcid.org/0000-0002-4979-5292"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alina Oprea","raw_affiliation_strings":["Northeastern University, Boston, USA"],"raw_orcid":"https://orcid.org/0000-0002-4979-5292","affiliations":[{"raw_affiliation_string":"Northeastern University, Boston, USA","institution_ids":["https://openalex.org/I12912129"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5045507972"],"corresponding_institution_ids":["https://openalex.org/I12912129"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.1821699,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"4409","last_page":"4422"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8475000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.8475000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.039500001817941666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.025800000876188278,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8345000147819519},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.698199987411499},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6761000156402588},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.42500001192092896},{"id":"https://openalex.org/keywords/resilience","display_name":"Resilience (materials science)","score":0.3993000090122223},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.3779999911785126}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8345000147819519},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.698199987411499},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6761000156402588},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6437000036239624},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4616999924182892},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.4341000020503998},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.3993000090122223},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.3779999911785126},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.36090001463890076},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.34209999442100525},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34040001034736633},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.32010000944137573},{"id":"https://openalex.org/C2778403875","wikidata":"https://www.wikidata.org/wiki/Q20312394","display_name":"Adversarial machine learning","level":3,"score":0.3091999888420105},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3003000020980835},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2639000117778778}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3719027.3765122","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3719027.3765122","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3719027.3765122","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM SIGSAC Conference on Computer and Communications Security","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3719027.3765122","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3719027.3765122","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3719027.3765122","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 ACM SIGSAC Conference on Computer and Communications Security","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G279616255","display_name":"NeTS: Medium: Resilient-by-Design Data-Driven NextG Open Radio Access Networks","funder_award_id":"2312875","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7731344590","display_name":"SaTC: TTP: Small: Poisoning-Resilient Machine Learning Models for Threat Detection","funder_award_id":"2331081","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8952256299","display_name":null,"funder_award_id":"CNS-2312875, CNS-2331081","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416549410.pdf","grobid_xml":"https://content.openalex.org/works/W4416549410.grobid-xml"},"referenced_works_count":6,"referenced_works":["https://openalex.org/W2971307358","https://openalex.org/W3168584517","https://openalex.org/W3176580738","https://openalex.org/W4285210581","https://openalex.org/W4288057743","https://openalex.org/W4385572845"],"related_works":[],"abstract_inverted_index":{"Model":[0],"distillation":[1,127],"has":[2],"become":[3],"essential":[4],"for":[5,153],"creating":[6],"deployable":[7],"language":[8],"models,":[9],"but":[10],"their":[11,18],"widespread":[12],"deployment":[13],"raises":[14],"concerns":[15],"about":[16,17],"resilience":[19],"to":[20,43,160],"adversarial":[21,103,156],"manipulation.":[22],"This":[23],"paper":[24],"investigates":[25],"how":[26],"adversaries":[27],"can":[28],"inject":[29],"subtle":[30],"biases":[31],"into":[32],"teacher":[33],"models":[34,82],"through":[35],"minimal":[36],"data":[37],"poisoning":[38,79],"during":[39],"training,":[40],"which":[41],"propagates":[42],"a":[44],"smaller":[45],"distilled":[46],"student":[47,81],"model":[48],"and":[49,62,129,141],"becomes":[50],"significantly":[51],"amplified.":[52],"We":[53,111,148],"identify":[54],"two":[55],"propagation":[56,99],"modes:":[57],"Untargeted":[58],"(affecting":[59],"multiple":[60],"tasks)":[61],"Targeted":[63],"(focusing":[64],"on":[65,108],"specific":[66],"task":[67],"while":[68,97],"maintaining":[69],"normal":[70],"behavior":[71],"elsewhere).":[72],"With":[73],"only":[74],"25":[75],"poisoned":[76],"samples":[77],"(0.25%":[78],"rate),":[80],"generate":[83],"biased":[84],"responses":[85],"76.9%":[86],"of":[87],"the":[88],"time":[89],"in":[90,95,106],"targeted":[91],"scenarios":[92],"versus":[93],"69.4%":[94],"teachers,":[96],"untargeted":[98],"shows":[100],"5.7X-29.2X":[101],"higher":[102],"bias":[104,115,138,157],"rate":[105],"students":[107],"unseen":[109],"tasks.":[110],"validate":[112],"across":[113],"six":[114],"types":[116],"(targeted":[117],"advertisement,":[118],"phishing":[119],"link,":[120],"narrative":[121],"manipulations,":[122],"insecure":[123],"coding":[124],"practices),":[125],"various":[126],"methods,":[128],"text/code":[130],"generation":[131],"modalities.":[132],"Current":[133],"defense":[134],"mechanisms\u2014including":[135],"perplexity":[136],"filtering,":[137],"detection":[139],"systems,":[140],"LLM-based":[142],"autoraters\u2014prove":[143],"inadequate":[144],"against":[145],"these":[146],"attacks.":[147],"propose":[149],"practical":[150],"design":[151],"principles":[152],"building":[154],"effective":[155],"mitigation":[158],"strategies":[159],"address":[161],"this":[162],"threat":[163],"vector.":[164]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-11-23T00:00:00"}
