{"id":"https://openalex.org/W4404304991","doi":"https://doi.org/10.48550/arxiv.2410.17438","title":"Interpreting Affine Recurrence Learning in GPT-style Transformers","display_name":"Interpreting Affine Recurrence Learning in GPT-style Transformers","publication_year":2024,"publication_date":"2024-10-22","ids":{"openalex":"https://openalex.org/W4404304991","doi":"https://doi.org/10.48550/arxiv.2410.17438"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.17438","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.17438","pdf_url":"https://arxiv.org/pdf/2410.17438","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.17438","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114617497","display_name":"Samarth Bhargav","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bhargav, Samarth","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5114617498","display_name":"Alexander Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Alexander","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5114617497"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9750000238418579,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9750000238418579,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9581000208854675,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9520999789237976,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.641586184501648},{"id":"https://openalex.org/keywords/affine-transformation","display_name":"Affine transformation","score":0.6260141134262085},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.4854736626148224},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.39708465337753296},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3242872357368469},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.16036897897720337},{"id":"https://openalex.org/keywords/pure-mathematics","display_name":"Pure mathematics","score":0.14293619990348816},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.10145315527915955},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.09363985061645508}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.641586184501648},{"id":"https://openalex.org/C92757383","wikidata":"https://www.wikidata.org/wiki/Q382497","display_name":"Affine transformation","level":2,"score":0.6260141134262085},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.4854736626148224},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.39708465337753296},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3242872357368469},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.16036897897720337},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.14293619990348816},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.10145315527915955},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.09363985061645508},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.17438","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.17438","pdf_url":"https://arxiv.org/pdf/2410.17438","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.17438","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.17438","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.17438","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.17438","pdf_url":"https://arxiv.org/pdf/2410.17438","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4404304991.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4391375266","https://openalex.org/W1979597421","https://openalex.org/W2007980826","https://openalex.org/W2061531152","https://openalex.org/W3002753104","https://openalex.org/W2077600819","https://openalex.org/W2142036596","https://openalex.org/W2072657027"],"abstract_inverted_index":{"Understanding":[0],"the":[1,36,51,87,101,108,116,128,158,172],"internal":[2,89],"mechanisms":[3],"of":[4,54,107,138,160,174],"GPT-style":[5],"transformers,":[6,56],"particularly":[7],"their":[8,33,60],"capacity":[9],"to":[10,27,62,81,134,168],"perform":[11],"in-context":[12],"learning":[13,24],"(ICL),":[14],"is":[15,120],"critical":[16],"for":[17,148,163],"advancing":[18],"AI":[19,150],"alignment":[20,151],"and":[21,64,85,94,144,171],"interpretability.":[22,154],"In-context":[23],"allows":[25],"transformers":[26],"generalize":[28],"during":[29],"inference":[30],"without":[31],"modifying":[32],"weights,":[34],"yet":[35],"precise":[37],"operations":[38,90],"driving":[39],"this":[40],"capability":[41],"remain":[42],"largely":[43],"opaque.":[44],"This":[45],"paper":[46],"presents":[47],"an":[48,69,104],"investigation":[49],"into":[50],"mechanistic":[52,153],"interpretability":[53],"these":[55],"focusing":[57],"specifically":[58],"on":[59],"ability":[61],"learn":[63],"predict":[65,82],"affine":[66,83],"recurrences":[67,84,170],"as":[68],"ICL":[70],"task.":[71],"To":[72],"address":[73],"this,":[74],"we":[75,156],"trained":[76],"a":[77,112,135],"custom":[78],"three-layer":[79],"transformer":[80,139],"analyzed":[86],"model's":[88],"using":[91,111],"both":[92],"empirical":[93],"theoretical":[95],"approaches.":[96],"Our":[97],"findings":[98],"reveal":[99],"that":[100],"model":[102],"forms":[103],"initial":[105],"estimate":[106],"target":[109],"sequence":[110],"copying":[113],"mechanism":[114],"in":[115,127,141],"zeroth":[117],"layer,":[118],"which":[119],"subsequently":[121],"refined":[122],"through":[123,152],"negative":[124],"similarity":[125],"heads":[126],"second":[129],"layer.":[130],"These":[131],"insights":[132],"contribute":[133],"deeper":[136],"understanding":[137],"behaviors":[140],"recursive":[142],"tasks":[143],"offer":[145],"potential":[146],"avenues":[147],"improving":[149],"Finally,":[155],"discuss":[157],"implications":[159],"our":[161],"results":[162],"future":[164],"work,":[165],"including":[166],"extensions":[167],"higher-dimensional":[169],"exploration":[173],"polynomial":[175],"sequences.":[176]},"counts_by_year":[],"updated_date":"2026-03-13T16:22:10.518609","created_date":"2025-10-10T00:00:00"}
