{"id":"https://openalex.org/W4311726128","doi":"https://doi.org/10.48550/arxiv.2212.07677","title":"Transformers learn in-context by gradient descent","display_name":"Transformers learn in-context by gradient descent","publication_year":2022,"publication_date":"2022-12-15","ids":{"openalex":"https://openalex.org/W4311726128","doi":"https://doi.org/10.48550/arxiv.2212.07677"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2212.07677","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2212.07677","pdf_url":"https://arxiv.org/pdf/2212.07677","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2212.07677","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103956420","display_name":"Johannes von Oswald","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"von Oswald, Johannes","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083787955","display_name":"Eyvind Niklasson","orcid":"https://orcid.org/0009-0001-1488-9037"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niklasson, Eyvind","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061878865","display_name":"Ettore Randazzo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Randazzo, Ettore","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030804320","display_name":"Jo\u00e3o Sacramento","orcid":"https://orcid.org/0000-0002-2837-9695"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sacramento, Jo\u00e3o","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022951041","display_name":"Alexander Mordvintsev","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mordvintsev, Alexander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113984302","display_name":"Andrey Zhmoginov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhmoginov, Andrey","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5067411442","display_name":"Max Vladymyrov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vladymyrov, Max","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5103956420"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":89,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9833999872207642,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9833999872207642,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9648000001907349,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7229516506195068},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6740728616714478},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.6410987377166748},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5411317944526672},{"id":"https://openalex.org/keywords/stochastic-gradient-descent","display_name":"Stochastic gradient descent","score":0.4826689064502716},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.411639928817749},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.12242823839187622},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.11401382088661194}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7229516506195068},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6740728616714478},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.6410987377166748},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5411317944526672},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.4826689064502716},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.411639928817749},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.12242823839187622},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.11401382088661194},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2212.07677","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2212.07677","pdf_url":"https://arxiv.org/pdf/2212.07677","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2212.07677","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2212.07677","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2212.07677","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2212.07677","pdf_url":"https://arxiv.org/pdf/2212.07677","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W3046775127","https://openalex.org/W3107602296","https://openalex.org/W3170094116","https://openalex.org/W4386462264","https://openalex.org/W4364306694","https://openalex.org/W4312192474","https://openalex.org/W4283697347","https://openalex.org/W4210805261"],"abstract_inverted_index":{"At":[0],"present,":[1],"the":[2,46,85,97,103,129,137,156,225],"mechanisms":[3],"of":[4,48,131,140,158,213],"in-context":[5,141,194,214],"learning":[6,142,163,195,215,219],"in":[7,119,128,143],"Transformers":[8,25,79,91,110,154],"are":[9],"not":[10],"well":[11],"understood":[12,208],"and":[13,59,90,168,202],"remain":[14],"mostly":[15],"an":[16,164],"intuition.":[17],"In":[18],"this":[19,148],"paper,":[20],"we":[21,72,106,150,182],"suggest":[22],"that":[23,44,70,75],"training":[24,77],"on":[26,64,80,147,172],"auto-regressive":[27],"objectives":[28],"is":[29],"closely":[30],"related":[31],"to":[32,134,176,186,190,223],"gradient-based":[33],"meta-learning":[34],"formulations.":[35],"We":[36],"start":[37],"by":[38,52,60,69,88,100,116,162,216],"providing":[39],"a":[40,54,65,187,210],"simple":[41,81],"weight":[42],"construction":[43],"shows":[45],"equivalence":[47],"data":[49,174],"transformations":[50],"induced":[51],"1)":[53],"single":[55],"linear":[56,170],"self-attention":[57],"layer":[58],"2)":[61],"gradient-descent":[62],"(GD)":[63],"regression":[66,82,132,179],"loss.":[67],"Motivated":[68],"construction,":[71],"show":[73,92,107,203],"empirically":[74],"when":[76],"self-attention-only":[78],"tasks":[83],"either":[84],"models":[86,115,171],"learned":[87],"GD":[89],"great":[93],"similarity":[94],"or,":[95],"remarkably,":[96],"weights":[98],"found":[99,229],"optimization":[101],"match":[102],"construction.":[104],"Thus":[105],"how":[108,153,204],"trained":[109],"become":[111],"mesa-optimizers":[112],"i.e.":[113],"learn":[114,169],"gradient":[117,160,217],"descent":[118,161,218],"their":[120],"forward":[121],"pass.":[122],"This":[123],"allows":[124],"us,":[125],"at":[126,230],"least":[127],"domain":[130],"problems,":[133],"mechanistically":[135],"understand":[136],"inner":[138],"workings":[139],"optimized":[144],"Transformers.":[145,221],"Building":[146],"insight,":[149],"furthermore":[151],"identify":[152],"surpass":[155],"performance":[157],"plain":[159],"iterative":[165],"curvature":[166],"correction":[167],"deep":[173],"representations":[175],"solve":[177],"non-linear":[178],"tasks.":[180],"Finally,":[181],"discuss":[183],"intriguing":[184],"parallels":[185],"mechanism":[188],"identified":[189],"be":[191,207,228],"crucial":[192],"for":[193],"termed":[196],"induction-head":[197],"(Olsson":[198],"et":[199],"al.,":[200],"2022)":[201],"it":[205],"could":[206],"as":[209],"specific":[211],"case":[212],"within":[220],"Code":[222],"reproduce":[224],"experiments":[226],"can":[227],"https://github.com/google-research/self-organising-systems/tree/master/transformers_learn_icl_by_gd":[231],".":[232]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":21},{"year":2024,"cited_by_count":46},{"year":2023,"cited_by_count":21}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
