{"id":"https://openalex.org/W7140325881","doi":"https://doi.org/10.48550/arxiv.2603.22801","title":"Transformers Trained via Gradient Descent Can Provably Learn a Class of Teacher Models","display_name":"Transformers Trained via Gradient Descent Can Provably Learn a Class of Teacher Models","publication_year":2026,"publication_date":"2026-03-24","ids":{"openalex":"https://openalex.org/W7140325881","doi":"https://doi.org/10.48550/arxiv.2603.22801"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22801","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22801","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22801","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130569084","display_name":"Chenyang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Chenyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130580368","display_name":"Qingyue Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Qingyue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120284571","display_name":"Quanquan Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Quanquan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130559253","display_name":"Yuan Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5130569084"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.4302999973297119,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.4302999973297119,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.10909999907016754,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.08399999886751175,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.64410001039505},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.3874000012874603},{"id":"https://openalex.org/keywords/population","display_name":"Population","score":0.35519999265670776},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.3488999903202057},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.33739998936653137},{"id":"https://openalex.org/keywords/nonlinear-system","display_name":"Nonlinear system","score":0.3287999927997589}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.64410001039505},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5676000118255615},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4514000117778778},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42080000042915344},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3874000012874603},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.35589998960494995},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.35519999265670776},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.3488999903202057},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C158622935","wikidata":"https://www.wikidata.org/wiki/Q660848","display_name":"Nonlinear system","level":2,"score":0.3287999927997589},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C205203396","wikidata":"https://www.wikidata.org/wiki/Q612143","display_name":"Bilinear interpolation","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C45357846","wikidata":"https://www.wikidata.org/wiki/Q2001982","display_name":"Notation","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C191795146","wikidata":"https://www.wikidata.org/wiki/Q3878446","display_name":"Norm (philosophy)","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22801","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22801","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22801","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22801","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.8567620515823364}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Transformers":[0],"have":[1],"achieved":[2],"great":[3],"success":[4,17],"across":[5],"a":[6,44,73,149,166],"wide":[7],"range":[8],"of":[9,26,46,75,101,119,134,152],"applications,":[10],"yet":[11],"the":[12,23,50,120,125,131],"theoretical":[13],"foundations":[14],"underlying":[15],"their":[16],"remain":[18],"largely":[19],"unexplored.":[20],"To":[21],"demystify":[22],"strong":[24],"capacities":[25],"transformers":[27,38,108,136],"applied":[28],"to":[29,41,148,164,178],"versatile":[30],"scenarios":[31],"and":[32,66,88],"tasks,":[33,174],"we":[34,104,140],"theoretically":[35],"investigate":[36],"utilizing":[37],"as":[39,189],"students":[40],"learn":[42],"from":[43,98],"class":[45,100,151],"teacher":[47,51,102,121,138],"models.":[48],"Specifically,":[49],"models":[52,79],"covered":[53],"in":[54,160],"our":[55,161],"analysis":[56,162],"include":[57],"convolution":[58,64],"layers":[59],"with":[60,109],"average":[61],"pooling,":[62],"graph":[63],"layers,":[65],"various":[67,172],"classic":[68],"statistical":[69],"learning":[70,97,173,181],"models,":[71,103,122,139],"including":[72],"variant":[74],"sparse":[76],"token":[77],"selection":[78],"[Sanford":[80],"et":[81,85,93],"al.,":[82,86,94],"2023,":[83],"Wang":[84],"2024]":[87],"group-sparse":[89],"linear":[90],"predictors":[91],"[Zhang":[92],"2025].":[95],"When":[96],"this":[99],"prove":[105],"that":[106,143],"one-layer":[107],"simplified":[110],"\"position-only''":[111],"attention":[112],"can":[113,145],"successfully":[114],"recover":[115],"all":[116],"parameter":[117],"blocks":[118],"thus":[123],"achieving":[124],"optimal":[126],"population":[127],"loss.":[128],"Building":[129],"upon":[130],"efficient":[132],"mimicry":[133],"trained":[135],"towards":[137],"further":[141],"demonstrate":[142],"they":[144],"generalize":[146],"well":[147],"broad":[150],"out-of-distribution":[153],"data":[154],"under":[155],"mild":[156],"assumptions.":[157],"The":[158],"key":[159],"is":[163],"identify":[165],"fundamental":[167],"bilinear":[168],"structure":[169],"shared":[170],"by":[171],"which":[175],"enables":[176],"us":[177],"establish":[179],"unified":[180],"guarantees":[182],"for":[183,191],"these":[184],"tasks":[185],"when":[186],"treating":[187],"them":[188],"teachers":[190],"transformers.":[192]},"counts_by_year":[],"updated_date":"2026-03-26T06:10:45.909354","created_date":"2026-03-26T00:00:00"}
