{"id":"https://openalex.org/W7139918933","doi":"https://doi.org/10.48550/arxiv.2603.18353","title":"Interpretability without actionability: mechanistic methods cannot correct language model errors despite near-perfect internal representations","display_name":"Interpretability without actionability: mechanistic methods cannot correct language model errors despite near-perfect internal representations","publication_year":2026,"publication_date":"2026-03-18","ids":{"openalex":"https://openalex.org/W7139918933","doi":"https://doi.org/10.48550/arxiv.2603.18353"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.18353","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18353","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.18353","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130249536","display_name":"Sanjay Basu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Basu, Sanjay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130230148","display_name":"Sadiq Y. Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Sadiq Y.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016472371","display_name":"Parth Sheth","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheth, Parth","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116080641","display_name":"Bhairavi Muralidharan","orcid":"https://orcid.org/0000-0001-7613-8514"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Muralidharan, Bhairavi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119757823","display_name":"Namrata Elamaran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elamaran, Namrata","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092105226","display_name":"Aakriti Kinra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kinra, Aakriti","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127929016","display_name":"John Morgan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Morgan, John","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5015814541","display_name":"Rajaie Batniji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Batniji, Rajaie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5130249536"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.47189998626708984,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.47189998626708984,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.24639999866485596,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12574","display_name":"Clinical Reasoning and Diagnostic Skills","score":0.05719999969005585,"subfield":{"id":"https://openalex.org/subfields/2714","display_name":"Family Practice"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9632999897003174},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6000000238418579},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.47589999437332153},{"id":"https://openalex.org/keywords/curse-of-dimensionality","display_name":"Curse of dimensionality","score":0.3625999987125397},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.3621000051498413},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.3176000118255615},{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.3124000132083893}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9632999897003174},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6022999882698059},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6000000238418579},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5095000267028809},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.47589999437332153},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.40299999713897705},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.3625999987125397},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.3621000051498413},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.3176000118255615},{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.3124000132083893},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.299699991941452},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.26919999718666077},{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C47559304","wikidata":"https://www.wikidata.org/wiki/Q1702189","display_name":"Orthogonalization","level":2,"score":0.2538999915122986}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.18353","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18353","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.18353","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.18353","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.5963898301124573}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Language":[0],"models":[1],"encode":[2],"task-relevant":[3],"knowledge":[4,157],"in":[5],"internal":[6,156],"representations":[7],"that":[8,167],"far":[9],"exceeds":[10],"their":[11],"output":[12,89],"performance,":[13],"but":[14,106,143],"whether":[15],"mechanistic":[16,32,150],"interpretability":[17,33,151,169],"methods":[18,34,152],"can":[19],"bridge":[20],"this":[21],"knowledge-action":[22,96],"gap":[23],"has":[24],"not":[25],"been":[26],"systematically":[27],"tested.":[28],"We":[29],"compared":[30],"four":[31],"--":[35,61],"concept":[36],"bottleneck":[37,99],"steering":[38,56,100,119,128],"(Steerling-8B),":[39],"sparse":[40],"autoencoder":[41],"feature":[42,118],"steering,":[43],"logit":[44],"lens":[45],"with":[46,52,83,161],"activation":[47],"patching,":[48],"and":[49],"linear":[50],"probing":[51],"truthfulness":[53],"separator":[54],"vector":[55],"(Qwen":[57],"2.5":[58],"7B":[59],"Instruct)":[60],"for":[62,163],"correcting":[63],"false-negative":[64],"triage":[65],"errors":[66,147],"using":[67],"400":[68],"physician-adjudicated":[69],"clinical":[70],"vignettes":[71],"(144":[72],"hazards,":[73],"256":[74],"benign).":[75],"Linear":[76],"probes":[77],"discriminated":[78],"hazardous":[79],"from":[80,113],"benign":[81],"cases":[82],"98.2%":[84],"AUROC,":[85],"yet":[86],"the":[87],"model's":[88],"sensitivity":[90],"was":[91],"only":[92],"45.1%,":[93],"a":[94],"53-percentage-point":[95],"gap.":[97],"Concept":[98],"corrected":[101,132,159],"20%":[102],"of":[103,109,134,140,146],"missed":[104,135],"hazards":[105,136],"disrupted":[107],"53%":[108],"correct":[110,141],"detections,":[111,142],"indistinguishable":[112],"random":[114],"perturbation":[115],"(p=0.84).":[116],"SAE":[117],"produced":[120],"zero":[121],"effect":[122],"despite":[123],"3,695":[124],"significant":[125],"features.":[126],"TSV":[127],"at":[129],"high":[130],"strength":[131],"24%":[133],"while":[137],"disrupting":[138],"6%":[139],"left":[144],"76%":[145],"uncorrected.":[148],"Current":[149],"cannot":[153],"reliably":[154],"translate":[155],"into":[158],"outputs,":[160],"implications":[162],"AI":[164],"safety":[165],"frameworks":[166],"assume":[168],"enables":[170],"effective":[171],"error":[172],"correction.":[173]},"counts_by_year":[],"updated_date":"2026-03-21T06:36:02.116451","created_date":"2026-03-21T00:00:00"}
