{"id":"https://openalex.org/W7139099463","doi":"https://doi.org/10.48550/arxiv.2603.15958","title":"Deriving Hyperparameter Scaling Laws via Modern Optimization Theory","display_name":"Deriving Hyperparameter Scaling Laws via Modern Optimization Theory","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7139099463","doi":"https://doi.org/10.48550/arxiv.2603.15958"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15958","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15958","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15958","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025477585","display_name":"Egor Shulgin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shulgin, Egor","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129834478","display_name":"Dimitri von R\u00fctte","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"von R\u00fctte, Dimitri","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070304382","display_name":"Tinghua Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tianyue H.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129774365","display_name":"Niccol\u00f2 Ajroldi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ajroldi, Niccol\u00f2","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130107152","display_name":"Bernhard Sch\u00f6lkopf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sch\u00f6lkopf, Bernhard","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129972840","display_name":"Antonio Orvieto","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Orvieto, Antonio","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9221000075340271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9221000075340271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.019600000232458115,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.009200000204145908,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hyperparameter","display_name":"Hyperparameter","score":0.8213000297546387},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5084999799728394},{"id":"https://openalex.org/keywords/oracle","display_name":"Oracle","score":0.4767000079154968},{"id":"https://openalex.org/keywords/minification","display_name":"Minification","score":0.4449000060558319},{"id":"https://openalex.org/keywords/quadratic-equation","display_name":"Quadratic equation","score":0.43939998745918274},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.4309999942779541},{"id":"https://openalex.org/keywords/rate-of-convergence","display_name":"Rate of convergence","score":0.40880000591278076},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.40310001373291016},{"id":"https://openalex.org/keywords/bayesian-optimization","display_name":"Bayesian optimization","score":0.3443000018596649}],"concepts":[{"id":"https://openalex.org/C8642999","wikidata":"https://www.wikidata.org/wiki/Q4171168","display_name":"Hyperparameter","level":2,"score":0.8213000297546387},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.5809000134468079},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5317000150680542},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5084999799728394},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.4767000079154968},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.4449000060558319},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.43939998745918274},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.4309999942779541},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.40880000591278076},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.40310001373291016},{"id":"https://openalex.org/C2778049539","wikidata":"https://www.wikidata.org/wiki/Q17002908","display_name":"Bayesian optimization","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.33880001306533813},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.337799996137619},{"id":"https://openalex.org/C10485038","wikidata":"https://www.wikidata.org/wiki/Q48996162","display_name":"Hyperparameter optimization","level":3,"score":0.3361000120639801},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3269999921321869},{"id":"https://openalex.org/C42747912","wikidata":"https://www.wikidata.org/wiki/Q1048447","display_name":"Multiplicative function","level":2,"score":0.31529998779296875},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.31290000677108765},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30309998989105225},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2955000102519989},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C60718061","wikidata":"https://www.wikidata.org/wiki/Q1414747","display_name":"Momentum (technical analysis)","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C2780148112","wikidata":"https://www.wikidata.org/wiki/Q1432581","display_name":"Proxy (statistics)","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.26080000400543213},{"id":"https://openalex.org/C41045048","wikidata":"https://www.wikidata.org/wiki/Q202843","display_name":"Linear programming","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C107321475","wikidata":"https://www.wikidata.org/wiki/Q5374254","display_name":"Empirical risk minimization","level":2,"score":0.257099986076355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15958","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15958","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15958","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15958","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Hyperparameter":[0],"transfer":[1,20,25],"has":[2],"become":[3],"an":[4],"important":[5],"component":[6],"of":[7,61,113],"modern":[8,55],"large-scale":[9],"training":[10,30],"recipes.":[11],"Existing":[12],"methods,":[13],"such":[14],"as":[15,90,111],"muP,":[16],"primarily":[17],"focus":[18],"on":[19,34,68],"between":[21,154],"model":[22,122],"sizes,":[23],"with":[24,139,166],"across":[26,96],"batch":[27,109],"sizes":[28],"and":[29,46,83,93,108,128,136,156],"horizons":[31],"often":[32],"relying":[33],"empirical":[35],"scaling":[36,52,168],"rules":[37],"informed":[38],"by":[39],"insights":[40,127],"from":[41,130],"timescale":[42],"preservation,":[43],"quadratic":[44],"proxies,":[45],"continuous-time":[47],"approximations.":[48],"We":[49],"study":[50],"hyperparameter":[51],"laws":[53],"for":[54,65,104,143],"first-order":[56],"optimizers":[57],"through":[58],"the":[59,69,114,131,152],"lens":[60],"recent":[62,88],"convergence":[63],"bounds":[64,86],"methods":[66],"based":[67],"Linear":[70],"Minimization":[71],"Oracle":[72],"(LMO),":[73],"a":[74,91,134],"framework":[75],"that":[76,160],"includes":[77],"normalized":[78],"SGD,":[79],"signSGD":[80],"(approximating":[81],"Adam),":[82],"Muon.":[84],"Treating":[85],"in":[87],"literature":[89,132],"proxy":[92],"minimizing":[94],"them":[95],"different":[97],"tuning":[98],"regimes":[99],"yields":[100],"closed-form":[101],"power-law":[102],"schedules":[103],"learning":[105],"rate,":[106],"momentum,":[107],"size":[110,123],"functions":[112],"iteration":[115],"or":[116],"token":[117],"budget.":[118],"Our":[119,146],"analysis,":[120],"holding":[121],"fixed,":[124],"recovers":[125],"most":[126],"observations":[129],"under":[133],"unified":[135],"principled":[137],"perspective,":[138],"clear":[140],"directions":[141],"open":[142],"future":[144],"research.":[145],"results":[147],"draw":[148],"particular":[149],"attention":[150],"to":[151],"interaction":[153],"momentum":[155],"batch-size":[157],"scaling,":[158],"suggesting":[159],"optimal":[161],"performance":[162],"may":[163],"be":[164],"achieved":[165],"several":[167],"strategies.":[169]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-20T00:00:00"}
