{"id":"https://openalex.org/W7128363283","doi":"https://doi.org/10.48550/arxiv.2602.06887","title":"Plato's Form: Toward Backdoor Defense-as-a-Service for LLMs with Prototype Representations","display_name":"Plato's Form: Toward Backdoor Defense-as-a-Service for LLMs with Prototype Representations","publication_year":2026,"publication_date":"2026-02-06","ids":{"openalex":"https://openalex.org/W7128363283","doi":"https://doi.org/10.48550/arxiv.2602.06887"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.06887","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125380019","display_name":"Chen Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125417680","display_name":"Yuchen Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yuchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gao, Jiaxin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Jiaxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125399999","display_name":"Yanwen Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Yanwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125391468","display_name":"Xueluan Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Xueluan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125391757","display_name":"Qian Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125409375","display_name":"Kwok-Yan Lam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lam, Kwok-Yan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5125380019"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7993000149726868,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.7993000149726868,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.06960000097751617,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11424","display_name":"Security and Verification in Computing","score":0.047600001096725464,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/backdoor","display_name":"Backdoor","score":0.9987000226974487},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5317000150680542},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3986999988555908},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.3093999922275543}],"concepts":[{"id":"https://openalex.org/C2781045450","wikidata":"https://www.wikidata.org/wiki/Q254569","display_name":"Backdoor","level":2,"score":0.9987000226974487},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5879999995231628},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5317000150680542},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42579999566078186},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3986999988555908},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.34610000252723694},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3093999922275543},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2858999967575073},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2621999979019165}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.06887","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.06887","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.06887","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.06887","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.4768807888031006,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"are":[4,20],"increasingly":[5],"deployed":[6],"in":[7,118,185,195],"security-sensitive":[8],"applications,":[9],"yet":[10],"remain":[11],"vulnerable":[12],"to":[13,22,176],"backdoor":[14,18,60,73,171,204],"attacks.":[15],"However,":[16],"existing":[17],"defenses":[19,161],"difficult":[21],"operationalize":[23],"for":[24,93],"Backdoor":[25],"Defense-as-a-Service":[26],"(BDaaS),":[27],"as":[28,132,181,183],"they":[29],"require":[30],"unrealistic":[31],"side":[32],"information":[33],"(e.g.,":[34],"downstream":[35],"clean":[36,77,196],"data,":[37],"known":[38],"triggers/targets,":[39],"or":[40],"task":[41],"domain":[42],"specifics),":[43],"and":[44,78,87,110,141,151,169,179,206],"lack":[45],"reusable,":[46],"scalable":[47],"purification":[48,61,113],"across":[49,145],"diverse":[50,164],"backdoored":[51,79],"models.":[52,210],"In":[53],"this":[54],"paper,":[55],"we":[56],"present":[57],"PROTOPURIFY,":[58],"a":[59,72,103,133,192],"framework":[62],"via":[63,97],"parameter":[64],"edits":[65],"under":[66],"minimal":[67,126],"assumptions.":[68],"PROTOPURIFY":[69,100,136,156,173,198],"first":[70],"builds":[71],"vector":[74],"pool":[75],"from":[76],"model":[80,96],"pairs,":[81],"aggregates":[82],"vectors":[83],"into":[84],"candidate":[85,92],"prototypes,":[86],"selects":[88],"the":[89,94,119],"most":[90],"aligned":[91],"target":[95],"similarity":[98],"matching.":[99],"then":[101],"identifies":[102],"boundary":[104],"layer":[105],"through":[106],"layer-wise":[107],"prototype":[108],"alignment":[109],"performs":[111],"targeted":[112],"by":[114],"suppressing":[115],"prototype-aligned":[116],"components":[117],"affected":[120],"layers,":[121],"achieving":[122],"fine-grained":[123],"mitigation":[124],"with":[125],"impact":[127],"on":[128,148,208],"benign":[129],"utility.":[130,197],"Designed":[131],"BDaaS-ready":[134],"primitive,":[135],"supports":[137],"reusability,":[138],"customizability,":[139],"interpretability,":[140],"runtime":[142],"efficiency.":[143],"Experiments":[144],"various":[146],"LLMs":[147],"both":[149],"classification":[150],"generation":[152],"tasks":[153],"show":[154],"that":[155],"consistently":[157],"outperforms":[158],"6":[159,163],"representative":[160],"against":[162,202],"attacks,":[165],"including":[166],"single-trigger,":[167],"multi-trigger,":[168],"triggerless":[170],"settings.":[172],"reduces":[174],"ASR":[175],"below":[177],"10%,":[178],"even":[180],"low":[182],"1.6%":[184],"some":[186],"cases,":[187],"while":[188],"incurring":[189],"less":[190],"than":[191],"3%":[193],"drop":[194],"further":[199],"demonstrates":[200],"robustness":[201],"adaptive":[203],"variants":[205],"stability":[207],"non-backdoored":[209]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-10T00:00:00"}
