{"id":"https://openalex.org/W7155005078","doi":"https://doi.org/10.48550/arxiv.2604.15416","title":"StoSignSGD: Unbiased Structural Stochasticity Fixes SignSGD for Training Large Language Models","display_name":"StoSignSGD: Unbiased Structural Stochasticity Fixes SignSGD for Training Large Language Models","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7155005078","doi":"https://doi.org/10.48550/arxiv.2604.15416"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.15416","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15416","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.15416","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109689425","display_name":"Dingzhi Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Dingzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134071750","display_name":"Rui Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Rui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134008348","display_name":"Yuxing Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100378800","display_name":"Tong Zhang","orcid":"https://orcid.org/0000-0002-7025-6365"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.1574999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.1574999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.14499999582767487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.13089999556541443,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.5839999914169312},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5817000269889832},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5435000061988831},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5121999979019165},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.5041999816894531},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.41920000314712524},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.40869998931884766},{"id":"https://openalex.org/keywords/sign","display_name":"Sign (mathematics)","score":0.4081999957561493}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6859999895095825},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.5839999914169312},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5817000269889832},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5435000061988831},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5121999979019165},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.5041999816894531},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4668999910354614},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4307999908924103},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.41920000314712524},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.40869998931884766},{"id":"https://openalex.org/C139676723","wikidata":"https://www.wikidata.org/wiki/Q1193832","display_name":"Sign (mathematics)","level":2,"score":0.4081999957561493},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.39719998836517334},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.3741999864578247},{"id":"https://openalex.org/C112680207","wikidata":"https://www.wikidata.org/wiki/Q714886","display_name":"Regular polygon","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.33090001344680786},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.33009999990463257},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.3041999936103821},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2935999929904938},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.28790000081062317},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2703000009059906},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.260699987411499},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.15416","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15416","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.15416","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.15416","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Sign-based":[0],"optimization":[1],"algorithms,":[2],"such":[3],"as":[4],"SignSGD,":[5,90],"have":[6],"garnered":[7],"significant":[8],"attention":[9],"for":[10],"their":[11,23],"remarkable":[12,164],"performance":[13,185],"in":[14,37,145],"distributed":[15],"learning":[16,40],"and":[17,45,133,161,190,227],"training":[18,142],"large":[19,138],"foundation":[20],"models.":[21],"Despite":[22],"empirical":[24],"superiority,":[25],"SignSGD":[26],"is":[27],"known":[28],"to":[29,42,166,170,193,233],"diverge":[30],"on":[31,178],"non-smooth":[32,105],"objectives,":[33],"which":[34],"are":[35],"ubiquitous":[36],"modern":[38],"machine":[39],"due":[41],"ReLUs,":[43],"max-pools,":[44],"mixture-of-experts.":[46],"To":[47],"overcome":[48],"this":[49,218],"fundamental":[50],"limitation,":[51],"we":[52,107,200,220],"propose":[53],"\\textbf{StoSignSGD},":[54],"an":[55,67],"algorithm":[56],"that":[57,82,112,117],"injects":[58],"structural":[59],"stochasticity":[60],"into":[61,212],"the":[62,72,86,97,101,121,195,222],"sign":[63,203],"operator":[64],"while":[65],"maintaining":[66],"unbiased":[68],"update":[69],"step.":[70],"In":[71],"regime":[73],"of":[74,89,207,225],"(online)":[75],"convex":[76],"optimization,":[77,106],"our":[78,236],"theoretical":[79],"analysis":[80],"shows":[81],"StoSignSGD":[83,118,129,157,182,226],"rigorously":[84],"resolves":[85],"non-convergence":[87],"issues":[88],"achieving":[91],"a":[92,150,163,202,229],"sharp":[93],"convergence":[94],"rate":[95],"matching":[96],"lower":[98],"bound.":[99],"For":[100],"more":[102],"challenging":[103],"non-convex":[104],"introduce":[108],"generalized":[109],"stationary":[110],"measures":[111],"encompass":[113],"prior":[114],"definitions,":[115],"proving":[116],"improves":[119],"upon":[120],"best-known":[122],"complexity":[123],"bounds":[124],"by":[125],"dimensional":[126],"factors.":[127],"Empirically,":[128],"exhibits":[130],"robust":[131],"stability":[132],"superior":[134],"efficiency":[135],"across":[136],"diverse":[137],"language":[139],"model":[140],"(LLM)":[141],"regimes.":[143],"Notably,":[144],"low-precision":[146],"FP8":[147],"pretraining":[148],"--":[149,156],"setting":[151],"where":[152],"AdamW":[153,189],"fails":[154],"catastrophically":[155],"remains":[158],"highly":[159],"stable":[160],"yields":[162],"1.44$\\times$":[165],"2.14$\\times$":[167],"speedup":[168],"relative":[169],"established":[171],"baselines.":[172],"Furthermore,":[173],"when":[174],"fine-tuning":[175],"7B":[176],"LLMs":[177],"mathematical":[179],"reasoning":[180],"tasks,":[181],"delivers":[183],"substantial":[184],"gains":[186],"over":[187],"both":[188],"SignSGD.":[191],"Finally,":[192],"dissect":[194],"mechanisms":[196],"driving":[197],"its":[198,213],"success,":[199],"develop":[201],"conversion":[204],"framework":[205],"capable":[206],"transforming":[208],"any":[209],"general":[210],"optimizer":[211],"unbiased,":[214],"sign-based":[215],"counterpart.":[216],"Utilizing":[217],"framework,":[219],"deconstruct":[221],"core":[223],"components":[224],"present":[228],"comprehensive":[230],"ablation":[231],"study":[232],"empirically":[234],"validate":[235],"algorithmic":[237],"design":[238],"choices.":[239]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-21T00:00:00"}
