{"id":"https://openalex.org/W4367628394","doi":"https://doi.org/10.48550/arxiv.2304.14997","title":"Towards Automated Circuit Discovery for Mechanistic Interpretability","display_name":"Towards Automated Circuit Discovery for Mechanistic Interpretability","publication_year":2023,"publication_date":"2023-04-28","ids":{"openalex":"https://openalex.org/W4367628394","doi":"https://doi.org/10.48550/arxiv.2304.14997"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2304.14997","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2304.14997","pdf_url":"https://arxiv.org/pdf/2304.14997","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2304.14997","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071070679","display_name":"Arthur Conmy","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Conmy, Arthur","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026671424","display_name":"Augustine N. Mavor-Parker","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mavor-Parker, Augustine N.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073432009","display_name":"Aengus Lynch","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lynch, Aengus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008499905","display_name":"Stefan Heimersheim","orcid":"https://orcid.org/0000-0001-9631-4212"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heimersheim, Stefan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5044945258","display_name":"Adri\u00e0 Garriga-Alonso","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garriga-Alonso, Adri\u00e0","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5071070679"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":31,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9846000075340271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9846000075340271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9829000234603882,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10876","display_name":"Fault Detection and Control Systems","score":0.9315999746322632,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.9716349840164185},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7421250343322754},{"id":"https://openalex.org/keywords/intuition","display_name":"Intuition","score":0.5860398411750793},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5462954640388489},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.457806795835495},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4555354118347168},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4464063346385956},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.43334463238716125},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4329875409603119},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4114648699760437},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3254506289958954},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.12481024861335754},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.08112388849258423}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.9716349840164185},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7421250343322754},{"id":"https://openalex.org/C132010649","wikidata":"https://www.wikidata.org/wiki/Q189222","display_name":"Intuition","level":2,"score":0.5860398411750793},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5462954640388489},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.457806795835495},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4555354118347168},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4464063346385956},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.43334463238716125},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4329875409603119},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4114648699760437},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3254506289958954},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.12481024861335754},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.08112388849258423},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2304.14997","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2304.14997","pdf_url":"https://arxiv.org/pdf/2304.14997","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2304.14997","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2304.14997","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2304.14997","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2304.14997","pdf_url":"https://arxiv.org/pdf/2304.14997","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4367628394.pdf","grobid_xml":"https://content.openalex.org/works/W4367628394.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2905433371","https://openalex.org/W4390569940","https://openalex.org/W2888392564","https://openalex.org/W4361193272","https://openalex.org/W4310278675","https://openalex.org/W4388422664","https://openalex.org/W2806259446","https://openalex.org/W2963326959","https://openalex.org/W4312407344","https://openalex.org/W4388685194"],"abstract_inverted_index":{"Through":[0],"considerable":[1],"effort":[2],"and":[3,29,59,96],"intuition,":[4],"several":[5,94],"recent":[6],"works":[7],"have":[8],"reverse-engineered":[9],"nontrivial":[10],"behaviors":[11],"of":[12,68,74,111,129,137],"transformer":[13],"models.":[14],"This":[15],"paper":[16],"systematizes":[17],"the":[18,33,52,56,66,75,80,84,88,106,112,123,130],"mechanistic":[19],"interpretability":[20,99],"process":[21],"they":[22,38],"followed.":[23],"First,":[24],"researchers":[25,63],"choose":[26],"a":[27,116],"metric":[28],"dataset":[30],"that":[31,82,121],"elicit":[32],"desired":[34],"model":[35],"behavior.":[36,53],"Then,":[37],"apply":[39],"activation":[40],"patching":[41],"to":[42,78,101],"find":[43],"which":[44,138],"abstract":[45],"neural":[46],"network":[47],"units":[48,60],"are":[49],"involved":[50],"in":[51,87,115,118,133],"By":[54],"varying":[55],"dataset,":[57],"metric,":[58],"under":[61],"investigation,":[62],"can":[64],"understand":[65],"functionality":[67],"each":[69],"component.":[70],"We":[71,92],"automate":[72],"one":[73],"process'":[76],"steps:":[77],"identify":[79],"circuit":[81,117],"implements":[83],"specified":[85],"behavior":[86],"model's":[89],"computational":[90],"graph.":[91],"propose":[93],"algorithms":[95],"reproduce":[97],"previous":[98,143],"results":[100],"validate":[102],"them.":[103],"For":[104],"example,":[105],"ACDC":[107,126],"algorithm":[108],"rediscovered":[109],"5/5":[110],"component":[113],"types":[114],"GPT-2":[119,134],"Small":[120],"computes":[122],"Greater-Than":[124],"operation.":[125],"selected":[127],"68":[128],"32,000":[131],"edges":[132],"Small,":[135],"all":[136],"were":[139],"manually":[140],"found":[141],"by":[142],"work.":[144],"Our":[145],"code":[146],"is":[147],"available":[148],"at":[149],"https://github.com/ArthurConmy/Automatic-Circuit-Discovery.":[150]},"counts_by_year":[{"year":2025,"cited_by_count":15},{"year":2024,"cited_by_count":13},{"year":2023,"cited_by_count":3}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
