{"id":"https://openalex.org/W4396814526","doi":"https://doi.org/10.48550/arxiv.2405.04156","title":"How does GPT-2 Predict Acronyms? Extracting and Understanding a Circuit via Mechanistic Interpretability","display_name":"How does GPT-2 Predict Acronyms? Extracting and Understanding a Circuit via Mechanistic Interpretability","publication_year":2024,"publication_date":"2024-05-07","ids":{"openalex":"https://openalex.org/W4396814526","doi":"https://doi.org/10.48550/arxiv.2405.04156"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2405.04156","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.04156","pdf_url":"https://arxiv.org/pdf/2405.04156","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2405.04156","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002758019","display_name":"Jorge Garc\u00eda\u2010Carrasco","orcid":"https://orcid.org/0000-0003-3174-083X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Garc\u00eda-Carrasco, Jorge","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091436252","display_name":"Alejandro Mat\u00e9","orcid":"https://orcid.org/0000-0001-7770-3693"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mat\u00e9, Alejandro","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5087667090","display_name":"Juan Trujillo","orcid":"https://orcid.org/0000-0003-0139-6724"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Trujillo, Juan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5002758019"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.8544999957084656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.8544999957084656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8352000117301941,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.779699981212616,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.959245502948761},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5512986779212952},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3329927921295166}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.959245502948761},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5512986779212952},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3329927921295166}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2405.04156","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.04156","pdf_url":"https://arxiv.org/pdf/2405.04156","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2405.04156","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2405.04156","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2405.04156","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.04156","pdf_url":"https://arxiv.org/pdf/2405.04156","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1354918299","display_name":null,"funder_award_id":"PROMETEO/2021/088","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G276391773","display_name":null,"funder_award_id":"ACIF/2021","funder_id":"https://openalex.org/F4320321864","funder_display_name":"Generalitat Valenciana"},{"id":"https://openalex.org/G2827925112","display_name":null,"funder_award_id":"ACIF/","funder_id":"https://openalex.org/F4320321864","funder_display_name":"Generalitat Valenciana"},{"id":"https://openalex.org/G4238776159","display_name":null,"funder_award_id":"PID2020-112540RB-C4","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G5036203499","display_name":null,"funder_award_id":"PID2020-112540RB-C43","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G5352278183","display_name":null,"funder_award_id":"RB-C43","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G5805502524","display_name":null,"funder_award_id":"PID2020","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G6589649124","display_name":null,"funder_award_id":"PID2020-","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"}],"funders":[{"id":"https://openalex.org/F4320321864","display_name":"Generalitat Valenciana","ror":"https://ror.org/0097mvx21"},{"id":"https://openalex.org/F4320322930","display_name":"Ministerio de Ciencia e Innovaci\u00f3n","ror":"https://ror.org/034900433"},{"id":"https://openalex.org/F4320338357","display_name":"FP7 Science in Society","ror":null}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4396814526.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2905433371","https://openalex.org/W2888392564","https://openalex.org/W4310278675","https://openalex.org/W4388422664","https://openalex.org/W4390569940","https://openalex.org/W4361193272","https://openalex.org/W2963326959","https://openalex.org/W4388685194"],"abstract_inverted_index":{"Transformer-based":[0],"language":[1],"models":[2],"are":[3],"treated":[4],"as":[5],"black-boxes":[6],"because":[7],"of":[8,12,35,51,75,93,108,113,147],"their":[9,125],"large":[10],"number":[11],"parameters":[13],"and":[14,150],"complex":[15,177],"internal":[16],"interactions,":[17],"which":[18,117,158],"is":[19,79,102,159],"a":[20,69,88,105],"serious":[21],"safety":[22],"concern.":[23],"Mechanistic":[24],"Interpretability":[25],"(MI)":[26],"intends":[27],"to":[28,85,124,170],"reverse-engineer":[29],"neural":[30],"network":[31],"behaviors":[32,178],"in":[33,57,120],"terms":[34],"human-understandable":[36],"components.":[37],"In":[38,138],"this":[39,78,168],"work,":[40],"we":[41,118,140],"focus":[42],"on":[43,65],"understanding":[44,175],"how":[45],"GPT-2":[46],"Small":[47],"performs":[48],"the":[49,58,73,80,91,100,114,134,143,148,162,172],"task":[50],"predicting":[52],"three-letter":[53],"acronyms.":[54],"Previous":[55],"works":[56],"MI":[59],"field":[60],"have":[61],"focused":[62],"so":[63],"far":[64],"tasks":[66],"that":[67,83,99,130,153],"predict":[68],"single":[70],"token.":[71],"To":[72],"best":[74],"our":[76],"knowledge,":[77],"first":[81],"work":[82,169],"tries":[84],"mechanistically":[86,141],"understand":[87],"behavior":[89],"involving":[90,179],"prediction":[92,101,136],"multiple":[94],"consecutive":[95],"tokens.":[96],"We":[97,127,166],"discover":[98],"performed":[103],"by":[104],"circuit":[106,149],"composed":[107],"8":[109],"attention":[110],"heads":[111,132,146],"(~5%":[112],"total":[115],"heads)":[116],"classified":[119],"three":[121],"groups":[122],"according":[123],"role.":[126],"also":[128],"demonstrate":[129],"these":[131],"concentrate":[133],"acronym":[135],"functionality.":[137],"addition,":[139],"interpret":[142],"most":[144],"relevant":[145],"find":[151],"out":[152],"they":[154],"use":[155],"positional":[156],"information":[157],"propagated":[160],"via":[161],"causal":[163],"mask":[164],"mechanism.":[165],"expect":[167],"lay":[171],"foundation":[173],"for":[174],"more":[176],"multiple-token":[180],"predictions.":[181]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2024-05-11T00:00:00"}
