{"id":"https://openalex.org/W7155061384","doi":"https://doi.org/10.48550/arxiv.2604.16812","title":"Introspection Adapters: Training LLMs to Report Their Learned Behaviors","display_name":"Introspection Adapters: Training LLMs to Report Their Learned Behaviors","publication_year":2026,"publication_date":"2026-04-18","ids":{"openalex":"https://openalex.org/W7155061384","doi":"https://doi.org/10.48550/arxiv.2604.16812"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.16812","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16812","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.16812","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134195834","display_name":"Keshav Shenoy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shenoy, Keshav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134133907","display_name":"Li Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134126427","display_name":"Abhay Sheshadri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheshadri, Abhay","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134116360","display_name":"S\u00f6ren Mindermann","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mindermann, S\u00f6ren","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134186379","display_name":"Jack Lindsey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindsey, Jack","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134195125","display_name":"Sam Marks","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marks, Sam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134125648","display_name":"Rowan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Rowan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.6358000040054321,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.6358000040054321,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12479","display_name":"Web Application Security Vulnerabilities","score":0.06790000200271606,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.05660000070929527,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.5914999842643738},{"id":"https://openalex.org/keywords/introspection","display_name":"Introspection","score":0.5430999994277954},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4034000039100647},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.3993000090122223},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.35679998993873596},{"id":"https://openalex.org/keywords/encryption","display_name":"Encryption","score":0.32100000977516174}],"concepts":[{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.5914999842643738},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5478000044822693},{"id":"https://openalex.org/C129671850","wikidata":"https://www.wikidata.org/wiki/Q210501","display_name":"Introspection","level":2,"score":0.5430999994277954},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.439300000667572},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4034000039100647},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3993000090122223},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.39899998903274536},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C75630572","wikidata":"https://www.wikidata.org/wiki/Q538904","display_name":"Applied psychology","level":1,"score":0.3564000129699707},{"id":"https://openalex.org/C148730421","wikidata":"https://www.wikidata.org/wiki/Q141090","display_name":"Encryption","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.31130000948905945},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3050000071525574},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.27239999175071716},{"id":"https://openalex.org/C83804111","wikidata":"https://www.wikidata.org/wiki/Q1063558","display_name":"Behavioral pattern","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.2549000084400177}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.16812","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16812","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.16812","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16812","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"When":[0],"model":[1,61,164],"developers":[2],"or":[3,17],"users":[4],"fine-tune":[5],"an":[6,88],"LLM,":[7],"this":[8,113],"can":[9,150],"induce":[10],"behaviors":[11,35,49,74,119],"that":[12,112,125,174],"are":[13,176],"unexpected,":[14],"deliberately":[15],"harmful,":[16],"hard":[18],"to":[19,26,45,102,105,139,154,184],"detect.":[20],"It":[21],"would":[22],"be":[23,152],"far":[24],"easier":[25],"audit":[27],"LLMs":[28,52],"if":[29],"they":[30],"could":[31],"simply":[32],"describe":[33],"their":[34,107],"in":[36,121,128],"natural":[37],"language.":[38],"Here,":[39],"we":[40],"study":[41],"a":[42,55,60,92,177],"scalable":[43],"approach":[44,183],"rapidly":[46],"identify":[47],"learned":[48,118],"of":[50,117,123],"many":[51],"derived":[53],"from":[54,70,132],"shared":[56],"base":[57],"LLM.":[58],"Given":[59],"$M$,":[62],"our":[63,171],"method":[64],"works":[65],"by":[66],"finetuning":[67,157],"models":[68],"$M_i$":[69,101],"$M$":[71,124],"with":[72,163],"implanted":[73,108],"$b_i$;":[75],"the":[76,99,133],"$(M_i,":[77],"b_i)$":[78],"pairs":[79],"serve":[80],"as":[81],"labeled":[82],"training":[83,167],"data.":[84],"We":[85,110],"then":[86],"train":[87],"introspection":[89],"adapter":[90,95],"(IA):":[91],"single":[93],"LoRA":[94],"jointly":[96],"trained":[97,127],"across":[98],"finetunes":[100,122],"cause":[103],"them":[104],"verbalize":[106],"behaviors.":[109,148],"find":[111],"IA":[114],"induces":[115],"self-description":[116],"even":[120],"were":[126],"very":[129],"different":[130],"ways":[131],"$M_i$.":[134],"For":[135],"example,":[136],"IAs":[137,149,175],"generalize":[138],"AuditBench,":[140],"achieving":[141],"state-of-the-art":[142],"at":[143],"identifying":[144],"explicitly":[145],"hidden":[146],"concerning":[147],"also":[151],"used":[153],"detect":[155],"encrypted":[156],"API":[158],"attacks.":[159],"They":[160],"scale":[161],"favorably":[162],"size":[165],"and":[166,180],"data":[168],"diversity.":[169],"Overall,":[170],"results":[172],"suggest":[173],"scalable,":[178],"effective,":[179],"practically":[181],"useful":[182],"auditing":[185],"fine-tuned":[186],"LLMs.":[187]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-22T00:00:00"}
