{"id":"https://openalex.org/W7162135667","doi":"https://doi.org/10.48550/arxiv.2605.22719","title":"Reading Task Failure Off the Activations: A Sparse-Feature Audit of GPT-2 Small on Indirect Object Identification","display_name":"Reading Task Failure Off the Activations: A Sparse-Feature Audit of GPT-2 Small on Indirect Object Identification","publication_year":2026,"publication_date":"2026-05-21","ids":{"openalex":"https://openalex.org/W7162135667","doi":"https://doi.org/10.48550/arxiv.2605.22719"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.22719","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22719","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.22719","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136798274","display_name":"Mahdi Nasermoghadasi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Nasermoghadasi, Mahdi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":["https://openalex.org/A5136798274"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.31459999084472656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.31459999084472656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.1331000030040741,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.027300000190734863,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.6872000098228455},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5372999906539917},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.49079999327659607},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.48539999127388},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4641999900341034},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.4571000039577484},{"id":"https://openalex.org/keywords/logistic-regression","display_name":"Logistic regression","score":0.43290001153945923},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.42879998683929443},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.38989999890327454}],"concepts":[{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.6872000098228455},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5813000202178955},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5655999779701233},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5372999906539917},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.49079999327659607},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.48539999127388},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4641999900341034},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.4571000039577484},{"id":"https://openalex.org/C151956035","wikidata":"https://www.wikidata.org/wiki/Q1132755","display_name":"Logistic regression","level":2,"score":0.43290001153945923},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.42879998683929443},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.38989999890327454},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.38690000772476196},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3862999975681305},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.3458000123500824},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3163999915122986},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30720001459121704},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.29499998688697815},{"id":"https://openalex.org/C148220186","wikidata":"https://www.wikidata.org/wiki/Q7111912","display_name":"Outcome (game theory)","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C64869954","wikidata":"https://www.wikidata.org/wiki/Q1859747","display_name":"False positive paradox","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C77052588","wikidata":"https://www.wikidata.org/wiki/Q644307","display_name":"Constant false alarm rate","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.27970001101493835},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2743000090122223},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.2639000117778778},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2624000012874603},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.22719","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22719","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.22719","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.22719","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,116,255],"report":[1],"a":[2,51,58,124,159,162,172,279],"small,":[3],"reproducible":[4],"audit":[5,240],"of":[6,11,21,37,47,70,99,144,231],"which":[7,94],"sparse-autoencoder":[8],"(SAE)":[9],"features":[10,40,191],"GPT-2":[12,31,95],"small":[13,32,96],"fire":[14],"differently":[15],"on":[16,93,104,175,278],"failed":[17],"versus":[18],"successful":[19],"trials":[20],"the":[22,38,42,86,100,105,137,145,156,176,188,193,209,225,239,257,259,262,266,272],"Indirect":[23],"Object":[24],"Identification":[25],"(IOI)":[26],"task.":[27],"On":[28],"300":[29],"prompts,":[30],"reaches":[33,181],"79.7%":[34],"accuracy;":[35],"146":[36],"24,576":[39],"in":[41,136,214,228],"layer-8":[43],"residual-stream":[44],"SAE":[45,190,194],"release":[46,256],"Bloom":[48],"(2024)":[49],"clear":[50],"Holm-corrected":[52],"significance":[53],"threshold":[54],"and":[55,268,271],"105":[56],"reach":[57],"large":[59],"effect":[60,218],"size":[61],"(|Cohen's":[62],"d|":[63],"&gt;":[64],"0.8).":[65],"The":[66,234,274],"strongest":[67],"single":[68,250],"correlate":[69,119],"failure":[71,211],"--":[72,80],"feature":[73,134,157,222,227,251],"17,491,":[74],"d=+2.93,":[75],"Neuronpedia":[76],"label":[77],"'cryptographic":[78],"keys'":[79],"is":[81,90,158,219,224,237],"essentially":[82],"silent":[83],"except":[84],"when":[85],"prompt's":[87],"transferred":[88],"object":[89],"'the":[91],"keys,'":[92],"fails":[97],"93.3%":[98],"time":[101],"vs.":[102],"7.5%":[103],"other":[106],"seven":[107],"objects":[108],"(Fisher":[109],"exact":[110],"p":[111],"=":[112,185],"8.79":[113],"x":[114],"10^-33).":[115],"put":[117],"this":[118,166],"through":[120,253],"three":[121],"controls":[122],"that":[123],"mechanistic":[125],"claim":[126],"should":[127],"pass.":[128],"(i)":[129],"A":[130,169,202],"causal":[131],"ablation:":[132],"zeroing":[133],"17,491":[135,223],"residual":[138,179],"stream":[139,180],"across":[140,205],"all":[141],"token":[142],"positions":[143],"45":[146],"keys":[147],"prompts":[148],"does":[149],"not":[150,161,198],"restore":[151],"accuracy":[152],"(6.7%":[153],"-&gt;":[154],"4.4%);":[155],"correlate,":[160],"sufficient":[163],"cause":[164],"at":[165],"layer.":[167],"(ii)":[168],"representation":[170],"baseline:":[171],"logistic":[173],"regression":[174],"raw":[177],"768-dimensional":[178],"5-fold":[182],"ROC":[183],"AUC":[184],"0.929,":[186],"matching":[187],"top-100":[189],"(0.927);":[192],"basis":[195],"adds":[196],"interpretability,":[197],"predictive":[199],"power.":[200],"(iii)":[201],"seed-robustness":[203],"check:":[204],"five":[206],"random":[207],"seeds":[208],"keys-subset":[210],"rate":[212],"stays":[213],"75.0--93.3%":[215],"(the":[216],"behavioural":[217],"real),":[220],"but":[221],"top-|d|":[226],"only":[229],"1":[230],"5":[232],"runs.":[233],"methodological":[235],"contribution":[236],"therefore":[238],"pipeline":[241,276],"(cheap,":[242],"model-agnostic,":[243],"surfaces":[244],"named":[245],"correlates)":[246],"rather":[247],"than":[248],"any":[249],"found":[252],"it.":[254],"code,":[258],"300-prompt":[260],"corpus,":[261],"300x24,576":[263],"activation":[264],"matrix,":[265],"ablation":[267],"baseline":[269],"scripts,":[270],"figures.":[273],"full":[275],"runs":[277],"laptop":[280],"(Apple":[281],"M3":[282],"Max,":[283],"no":[284],"discrete":[285],"GPU).":[286]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-23T00:00:00"}
