{"id":"https://openalex.org/W4403662640","doi":"https://doi.org/10.48550/arxiv.2408.13359","title":"Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler","display_name":"Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler","publication_year":2024,"publication_date":"2024-08-23","ids":{"openalex":"https://openalex.org/W4403662640","doi":"https://doi.org/10.48550/arxiv.2408.13359"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.13359","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13359","pdf_url":"https://arxiv.org/pdf/2408.13359","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.13359","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073742611","display_name":"Yikang Shen","orcid":"https://orcid.org/0000-0001-6836-0510"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shen, Yikang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093875966","display_name":"Matthew Stallone","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stallone, Matthew","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101772090","display_name":"Mayank Mishra","orcid":"https://orcid.org/0000-0001-5034-6964"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mishra, Mayank","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025337794","display_name":"Gaoyuan Zhang","orcid":"https://orcid.org/0000-0003-3374-4092"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Gaoyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089182977","display_name":"Shawn Tan","orcid":"https://orcid.org/0000-0002-0489-7455"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Shawn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101995679","display_name":"Aditya Prasad","orcid":"https://orcid.org/0000-0001-6967-6824"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prasad, Aditya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065662096","display_name":"Adriana Meza Soria","orcid":"https://orcid.org/0000-0003-4392-9242"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soria, Adriana Meza","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100763986","display_name":"David Cox","orcid":"https://orcid.org/0000-0002-2189-9743"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cox, David D.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5049734237","display_name":"Rameswar Panda","orcid":"https://orcid.org/0000-0003-4359-2475"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Panda, Rameswar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5073742611"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.8270000219345093,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.8270000219345093,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11032","display_name":"VLSI and Analog Circuit Testing","score":0.7835999727249146,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11283","display_name":"Experimental Learning in Engineering","score":0.7511000037193298,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6887794733047485},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6204053163528442},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.43095892667770386},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.32874155044555664}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6887794733047485},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6204053163528442},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.43095892667770386},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.32874155044555664},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.13359","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13359","pdf_url":"https://arxiv.org/pdf/2408.13359","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.13359","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.13359","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.13359","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.13359","pdf_url":"https://arxiv.org/pdf/2408.13359","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403662640.pdf","grobid_xml":"https://content.openalex.org/works/W4403662640.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700"],"abstract_inverted_index":{"Finding":[0],"the":[1,75,85,102,118,132,158,173,186,206,227],"optimal":[2,76,121],"learning":[3,24,122,164],"rate":[4,165],"for":[5,49,93,131],"language":[6,51,236],"model":[7,32,94,154,213,217],"pretraining":[8],"is":[9,14,19,41,88,111,170],"a":[10,20,46,144,162],"challenging":[11],"task.":[12],"This":[13],"not":[15],"only":[16],"because":[17,39],"there":[18],"complicated":[21],"correlation":[22,119],"between":[23,120,147],"rate,":[25,123],"batch":[26,124,179,211],"size,":[27,33,125,212,214],"number":[28,127,174,207],"of":[29,57,128,139,175,202,205,208],"training":[30,129,176,209],"tokens,":[31,210],"and":[34,66,73,81,90,100,126,149,178,215,222],"other":[35],"hyperparameters":[36,203],"but":[37],"also":[38],"it":[40],"prohibitively":[42],"expensive":[43],"to":[44,69,78,108],"perform":[45,70],"hyperparameter":[47,71],"search":[48],"large":[50,79,82,109],"models":[52,65,80,224,242],"with":[53,189,199,226],"Billions":[54],"or":[55],"Trillions":[56],"parameters.":[58],"Recent":[59],"studies":[60],"propose":[61,161],"using":[62],"small":[63,67,106,140,235],"proxy":[64],"corpus":[68,107,110],"searches":[72],"transposing":[74],"parameters":[77],"corpus.":[83],"While":[84],"zero-shot":[86,103],"transferability":[87,152],"theoretically":[89],"empirically":[91],"proven":[92],"size":[95],"related":[96],"hyperparameters,":[97],"like":[98],"depth":[99],"width,":[101],"transfer":[104],"from":[105],"underexplored.":[112],"In":[113],"this":[114],"paper,":[115],"we":[116,142,160],"study":[117],"tokens":[130,177],"recently":[133],"proposed":[134],"WSD":[135],"scheduler.":[136],"After":[137],"thousands":[138],"experiments,":[141],"found":[143],"power-law":[145],"relationship":[146],"variables":[148],"demonstrated":[150],"its":[151],"across":[153],"sizes.":[155],"Based":[156],"on":[157],"observation,":[159],"new":[163],"scheduler,":[166,168],"Power":[167,187,228],"that":[169,184],"agnostic":[171],"about":[172],"size.":[180],"The":[181],"experiment":[182],"shows":[183],"combining":[185],"scheduler":[188,229],"Maximum":[190],"Update":[191],"Parameterization":[192],"(muP)":[193],"can":[194],"consistently":[195],"achieve":[196,230],"impressive":[197],"performance":[198,232],"one":[200],"set":[201],"regardless":[204],"even":[216],"architecture.":[218],"Our":[219],"3B":[220],"dense":[221],"MoE":[223],"trained":[225],"comparable":[231],"as":[233],"state-of-the-art":[234],"models.":[237],"We":[238],"open-source":[239],"these":[240],"pretrained":[241],"at":[243],"https://ibm.biz/BdKhLa.":[244]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
