{"id":"https://openalex.org/W7140204093","doi":"https://doi.org/10.48550/arxiv.2603.21396","title":"Mechanisms of Introspective Awareness","display_name":"Mechanisms of Introspective Awareness","publication_year":2026,"publication_date":"2026-03-22","ids":{"openalex":"https://openalex.org/W7140204093","doi":"https://doi.org/10.48550/arxiv.2603.21396"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21396","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Macar, Uzay","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Macar, Uzay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Atticus","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Atticus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wallich, Peter","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wallich, Peter","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ameisen, Emmanuel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ameisen, Emmanuel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Lindsey, Jack","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindsey, Jack","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5442000031471252,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.5442000031471252,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.15729999542236328,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.06109999865293503,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/introspection","display_name":"Introspection","score":0.9387999773025513},{"id":"https://openalex.org/keywords/false-positive-paradox","display_name":"False positive paradox","score":0.5954999923706055},{"id":"https://openalex.org/keywords/anomaly-detection","display_name":"Anomaly detection","score":0.5572999715805054},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4586000144481659},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.3840999901294708},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.37380000948905945}],"concepts":[{"id":"https://openalex.org/C129671850","wikidata":"https://www.wikidata.org/wiki/Q210501","display_name":"Introspection","level":2,"score":0.9387999773025513},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.5954999923706055},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.5572999715805054},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.5515999794006348},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.5371999740600586},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5281000137329102},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4586000144481659},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42800000309944153},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.3840999901294708},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.37380000948905945},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.3562000095844269},{"id":"https://openalex.org/C50335755","wikidata":"https://www.wikidata.org/wiki/Q483247","display_name":"Phenomenon","level":2,"score":0.35409998893737793},{"id":"https://openalex.org/C12997251","wikidata":"https://www.wikidata.org/wiki/Q567560","display_name":"Anomaly (physics)","level":2,"score":0.3334999978542328},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.3075999915599823},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.2599000036716461},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"work":[1],"has":[2],"shown":[3],"that":[4,42,74,92,138,166,178,212],"LLMs":[5],"can":[6,80],"sometimes":[7],"detect":[8,48,128],"when":[9],"steering":[10,50,103],"vectors":[11,51,104],"are":[12],"injected":[13,21,49,158,217],"into":[14],"their":[15],"residual":[16],"stream":[17],"and":[18,62,105,151,191,221,224],"identify":[19],"the":[20,31,112],"concept":[22],"--":[23],"a":[24,116,140,192],"phenomenon":[25],"termed":[26],"\"introspective":[27],"awareness.\"":[28],"We":[29,89,110],"investigate":[30],"mechanisms":[32,165],"underlying":[33],"this":[34,66,213],"capability":[35,67,180],"in":[36,119,124,148,173,229],"open-weights":[37],"models.":[38,231],"First,":[39],"we":[40,72,176],"find":[41],"it":[43,197],"is":[44,146,181,219],"behaviorally":[45],"robust:":[46],"models":[47,150],"at":[52],"moderate":[53],"rates":[54],"with":[55,170],"0%":[56],"false":[57,207],"positives":[58],"across":[59],"diverse":[60,132],"prompts":[61],"dialogue":[63],"formats.":[64],"Notably,":[65],"emerges":[68],"specifically":[69],"from":[70],"post-training;":[71],"show":[73,177],"preference":[75],"optimization":[76],"algorithms":[77],"like":[78],"DPO":[79],"elicit":[81],"it,":[82],"but":[83],"standard":[84],"supervised":[85],"finetuning":[86],"does":[87],"not.":[88],"provide":[90],"evidence":[91],"detection":[93,113,188],"cannot":[94],"be":[95,226],"explained":[96],"by":[97,189,198],"simple":[98],"linear":[99],"association":[100],"between":[101],"certain":[102],"directions":[106,186],"promoting":[107],"affirmative":[108],"responses.":[109],"trace":[111],"mechanism":[114],"to":[115,153],"two-stage":[117],"circuit":[118,145],"which":[120],"\"evidence":[121],"carrier\"":[122],"features":[123,137],"early":[125],"post-injection":[126],"layers":[127],"perturbations":[129],"monotonically":[130],"along":[131],"directions,":[133],"suppressing":[134],"downstream":[135],"\"gate\"":[136],"implement":[139],"default":[141],"negative":[142],"response.":[143],"This":[144],"absent":[147],"base":[149],"robust":[152,220],"refusal":[154,185],"ablation.":[155],"Identification":[156],"of":[157,216],"concepts":[159,218],"relies":[160],"on":[161,200],"largely":[162],"distinct":[163],"later-layer":[164],"only":[167],"weakly":[168],"overlap":[169],"those":[171],"involved":[172],"detection.":[174],"Finally,":[175],"introspective":[179,214],"substantially":[182,227],"underelicited:":[183],"ablating":[184],"improves":[187,196],"+53%,":[190],"trained":[193],"bias":[194],"vector":[195],"+75%":[199],"held-out":[201],"concepts,":[202],"both":[203],"without":[204],"meaningfully":[205],"increasing":[206],"positives.":[208],"Our":[209],"results":[210],"suggest":[211],"awareness":[215],"mechanistically":[222],"nontrivial,":[223],"could":[225],"amplified":[228],"future":[230],"Code:":[232],"https://github.com/safety-research/introspection-mechanisms.":[233]},"counts_by_year":[],"updated_date":"2026-04-14T06:02:45.956762","created_date":"2026-03-25T00:00:00"}
