{"id":"https://openalex.org/W7128064915","doi":"https://doi.org/10.48550/arxiv.2602.03994","title":"Bypassing the Rationale: Causal Auditing of Implicit Reasoning in Language Models","display_name":"Bypassing the Rationale: Causal Auditing of Implicit Reasoning in Language Models","publication_year":2026,"publication_date":"2026-02-03","ids":{"openalex":"https://openalex.org/W7128064915","doi":"https://doi.org/10.48550/arxiv.2602.03994"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.03994","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125245067","display_name":"Anish Sathyanarayanan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sathyanarayanan, Anish","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120278161","display_name":"Aditya Nagarsekar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nagarsekar, Aditya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125103362","display_name":"Aarush Rathore","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rathore, Aarush","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5125245067"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.15479999780654907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.15479999780654907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.13420000672340393,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11883","display_name":"Embodied and Extended Cognition","score":0.06669999659061432,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mediation","display_name":"Mediation","score":0.6452999711036682},{"id":"https://openalex.org/keywords/transparency","display_name":"Transparency (behavior)","score":0.5659999847412109},{"id":"https://openalex.org/keywords/causal-model","display_name":"Causal model","score":0.5418000221252441},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5392000079154968},{"id":"https://openalex.org/keywords/causal-reasoning","display_name":"Causal reasoning","score":0.5076000094413757},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.45879998803138733},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.387800008058548},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.376800000667572}],"concepts":[{"id":"https://openalex.org/C179420905","wikidata":"https://www.wikidata.org/wiki/Q223871","display_name":"Mediation","level":2,"score":0.6452999711036682},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6220999956130981},{"id":"https://openalex.org/C2780233690","wikidata":"https://www.wikidata.org/wiki/Q535347","display_name":"Transparency (behavior)","level":2,"score":0.5659999847412109},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.5418000221252441},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5392000079154968},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.5076000094413757},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.5009999871253967},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.45879998803138733},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38839998841285706},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.387800008058548},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.376800000667572},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.36570000648498535},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3418000042438507},{"id":"https://openalex.org/C2779458634","wikidata":"https://www.wikidata.org/wiki/Q24963715","display_name":"Debiasing","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C2776325391","wikidata":"https://www.wikidata.org/wiki/Q6917865","display_name":"Motivated reasoning","level":3,"score":0.3237000107765198},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C163504300","wikidata":"https://www.wikidata.org/wiki/Q2364925","display_name":"Causal structure","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C21963081","wikidata":"https://www.wikidata.org/wiki/Q11337567","display_name":"Working memory","level":3,"score":0.273499995470047},{"id":"https://openalex.org/C79897977","wikidata":"https://www.wikidata.org/wiki/Q5054568","display_name":"Causal chain","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C2776035688","wikidata":"https://www.wikidata.org/wiki/Q1606558","display_name":"Affect (linguistics)","level":2,"score":0.2535000145435333}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.03994","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.03994","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.03994","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.03994","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.48669925332069397,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Chain-of-thought":[0],"(CoT)":[1],"prompting":[2],"is":[3,11,103,117],"widely":[4],"used":[5],"as":[6,14,182],"a":[7,15,53,183],"reasoning":[8,36,131],"aid":[9],"and":[10,96,110,136,166,168],"often":[12],"treated":[13],"transparency":[16,184],"mechanism.":[17],"Yet":[18],"behavioral":[19],"gains":[20],"under":[21],"CoT":[22,58,68,121,160,181],"do":[23],"not":[24],"imply":[25],"that":[26,100,126,159],"the":[27,34,67],"model's":[28],"internal":[29],"computation":[30,47],"causally":[31],"depends":[32],"on":[33,61],"emitted":[35],"text,":[37],"i.e.,":[38],"models":[39,127,146,165],"may":[40],"produce":[41],"fluent":[42],"rationales":[43],"while":[44,144],"routing":[45],"decision-critical":[46],"through":[48],"latent":[49],"pathways.":[50],"We":[51,123],"introduce":[52],"causal,":[54,176],"layerwise":[55,177],"audit":[56],"of":[57],"faithfulness":[59,161],"based":[60],"activation":[62],"patching.":[63],"Our":[64],"key":[65],"metric,":[66],"Mediation":[69],"Index":[70],"(CMI),":[71],"isolates":[72],"CoT-specific":[73,101],"causal":[74],"influence":[75,102],"by":[76],"comparing":[77],"performance":[78],"degradation":[79],"from":[80,172],"patching":[81],"CoT-token":[82],"hidden":[83],"states":[84],"against":[85],"matched":[86],"control":[87],"patches.":[88],"Across":[89],"multiple":[90],"model":[91],"families":[92],"(Phi,":[93],"Qwen,":[94],"DialoGPT)":[95],"scales,":[97],"we":[98,111],"find":[99],"typically":[104],"depth-localized":[105],"into":[106],"narrow":[107],"\"reasoning":[108],"windows,\"":[109],"identify":[112],"bypass":[113],"regimes":[114],"where":[115],"CMI":[116],"near-zero":[118],"despite":[119],"plausible":[120],"text.":[122],"further":[124],"observe":[125],"tuned":[128],"explicitly":[129],"for":[130],"tend":[132],"to":[133],"exhibit":[134],"stronger":[135],"more":[137,148],"structured":[138],"mediation":[139,150],"than":[140],"larger":[141],"untuned":[142],"counterparts,":[143],"Mixture-of-Experts":[145],"show":[147,158],"distributed":[149],"consistent":[151],"with":[152],"routing-based":[153],"computation.":[154],"Overall,":[155],"our":[156],"results":[157],"varies":[162],"substantially":[163],"across":[164],"tasks":[167],"cannot":[169],"be":[170],"inferred":[171],"behavior":[173],"alone,":[174],"motivating":[175],"audits":[178],"when":[179],"using":[180],"signal.":[185]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-07T00:00:00"}
