{"id":"https://openalex.org/W4416246499","doi":"https://doi.org/10.48550/arxiv.2510.19152","title":"Subliminal Corruption: Mechanisms, Thresholds, and Interpretability","display_name":"Subliminal Corruption: Mechanisms, Thresholds, and Interpretability","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416246499","doi":"https://doi.org/10.48550/arxiv.2510.19152"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2510.19152","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.19152","pdf_url":"https://arxiv.org/pdf/2510.19152","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2510.19152","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120650840","display_name":"Reya Vir","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vir, Reya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Bhatnagar, Sarvesh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhatnagar, Sarvesh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.3837999999523163,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.3837999999523163,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.3555999994277954,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.05730000138282776,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9546999931335449},{"id":"https://openalex.org/keywords/subliminal-stimuli","display_name":"Subliminal stimuli","score":0.9002000093460083},{"id":"https://openalex.org/keywords/vulnerability","display_name":"Vulnerability (computing)","score":0.5364999771118164},{"id":"https://openalex.org/keywords/language-change","display_name":"Language change","score":0.44679999351501465},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.39160001277923584}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9546999931335449},{"id":"https://openalex.org/C151496658","wikidata":"https://www.wikidata.org/wiki/Q310661","display_name":"Subliminal stimuli","level":2,"score":0.9002000093460083},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6765999794006348},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.5364999771118164},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4738999903202057},{"id":"https://openalex.org/C2780027415","wikidata":"https://www.wikidata.org/wiki/Q524648","display_name":"Language change","level":2,"score":0.44679999351501465},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.39160001277923584},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3612000048160553},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.33799999952316284},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33480000495910645},{"id":"https://openalex.org/C50335755","wikidata":"https://www.wikidata.org/wiki/Q483247","display_name":"Phenomenon","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.25270000100135803}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2510.19152","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.19152","pdf_url":"https://arxiv.org/pdf/2510.19152","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2510.19152","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2510.19152","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2510.19152","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2510.19152","pdf_url":"https://arxiv.org/pdf/2510.19152","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416246499.pdf","grobid_xml":"https://content.openalex.org/works/W4416246499.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"machine":[1],"learning":[2],"models":[3],"are":[4,34],"increasingly":[5],"fine-tuned":[6],"on":[7,154],"synthetic":[8,155],"data,":[9,39,119],"there":[10],"is":[11,56],"a":[12,50,64,78,109,114,146],"critical":[13,115,147],"risk":[14],"of":[15,53,67,74,117],"subtle":[16],"misalignments":[17],"spreading":[18],"through":[19,36],"interconnected":[20],"AI":[21,150],"systems.":[22],"This":[23],"paper":[24],"investigates":[25],"subliminal":[26,75,90],"corruption,":[27],"which":[28],"we":[29,62],"define":[30],"as":[31],"undesirable":[32],"traits":[33],"transmitted":[35],"semantically":[37],"neutral":[38],"bypassing":[40],"standard":[41],"safety":[42,163],"checks.":[43],"While":[44],"this":[45,60],"phenomenon":[46],"has":[47],"been":[48],"identified,":[49],"quantitative":[51],"understanding":[52],"its":[54],"dynamics":[55],"missing.":[57],"To":[58],"address":[59],"gap,":[61],"present":[63],"systematic":[65],"study":[66],"the":[68,96,102,129,133,159],"scaling":[69],"laws,":[70],"thresholds,":[71],"and":[72,124,157],"mechanisms":[73],"corruption":[76,91,130],"using":[77],"teacher-student":[79],"setup":[80],"with":[81],"GPT-2.":[82],"Our":[83],"experiments":[84],"reveal":[85],"three":[86],"key":[87],"findings:":[88],"(1)":[89],"causes":[92],"behavioral":[93],"crossover,":[94],"degrading":[95,122],"model's":[97,134],"overall":[98],"alignment,":[99],"not":[100],"just":[101],"targeted":[103],"trait;":[104],"(2)":[105],"alignment":[106],"fails":[107],"in":[108,149],"sharp":[110],"phase":[111],"transition":[112],"at":[113],"threshold":[116],"poisoned":[118],"rather":[120],"than":[121],"gradually;":[123],"(3)":[125],"interpretability":[126],"analysis":[127],"shows":[128],"mechanism":[131],"mimics":[132],"natural":[135],"fine-tuning":[136],"process,":[137],"making":[138],"it":[139],"difficult":[140],"to":[141],"detect.":[142],"These":[143],"results":[144],"demonstrate":[145],"vulnerability":[148],"systems":[151],"that":[152,165],"rely":[153],"data":[156],"highlight":[158],"need":[160],"for":[161,168],"new":[162],"protocols":[164],"can":[166],"account":[167],"latent":[169],"threats.":[170]},"counts_by_year":[],"updated_date":"2026-07-03T08:13:44.112507","created_date":"2025-10-24T00:00:00"}
