{"id":"https://openalex.org/W2808402975","doi":"https://doi.org/10.24963/ijcai.2018/410","title":"A Unified Analysis of Stochastic Momentum Methods for Deep Learning","display_name":"A Unified Analysis of Stochastic Momentum Methods for Deep Learning","publication_year":2018,"publication_date":"2018-07-01","ids":{"openalex":"https://openalex.org/W2808402975","doi":"https://doi.org/10.24963/ijcai.2018/410","mag":"2808402975"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2018/410","is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2018/410","pdf_url":"https://www.ijcai.org/proceedings/2018/0410.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.ijcai.org/proceedings/2018/0410.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100395035","display_name":"Yan Yan","orcid":"https://orcid.org/0000-0001-9108-6767"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]},{"id":"https://openalex.org/I3045169105","display_name":"Southern University of Science and Technology","ror":"https://ror.org/049tv2d57","country_code":"CN","type":"education","lineage":["https://openalex.org/I3045169105"]}],"countries":["AU","CN"],"is_corresponding":true,"raw_author_name":"Yan Yan","raw_affiliation_strings":["Centre for Artificial Intelligence, University of Technology Sydney","SUSTech-UTS Joint Centre of CIS, Southern University of Science and Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Artificial Intelligence, University of Technology Sydney","institution_ids":["https://openalex.org/I114017466"]},{"raw_affiliation_string":"SUSTech-UTS Joint Centre of CIS, Southern University of Science and Technology","institution_ids":["https://openalex.org/I3045169105"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023288846","display_name":"Tianbao Yang","orcid":"https://orcid.org/0000-0002-7858-5438"},"institutions":[{"id":"https://openalex.org/I126307644","display_name":"University of Iowa","ror":"https://ror.org/036jqmy94","country_code":"US","type":"education","lineage":["https://openalex.org/I126307644"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tianbao Yang","raw_affiliation_strings":["Department of Computer Science, The University of Iowa"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, The University of Iowa","institution_ids":["https://openalex.org/I126307644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100356707","display_name":"Zhe Li","orcid":"https://orcid.org/0000-0003-2779-582X"},"institutions":[{"id":"https://openalex.org/I126307644","display_name":"University of Iowa","ror":"https://ror.org/036jqmy94","country_code":"US","type":"education","lineage":["https://openalex.org/I126307644"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhe Li","raw_affiliation_strings":["Department of Computer Science, The University of Iowa"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, The University of Iowa","institution_ids":["https://openalex.org/I126307644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090417160","display_name":"Qihang Lin","orcid":"https://orcid.org/0000-0003-2943-3267"},"institutions":[{"id":"https://openalex.org/I126307644","display_name":"University of Iowa","ror":"https://ror.org/036jqmy94","country_code":"US","type":"education","lineage":["https://openalex.org/I126307644"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qihang Lin","raw_affiliation_strings":["Tippie College of Business, The University of Iowa"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tippie College of Business, The University of Iowa","institution_ids":["https://openalex.org/I126307644"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5005421447","display_name":"Yi Yang","orcid":"https://orcid.org/0000-0002-0512-880X"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]},{"id":"https://openalex.org/I3045169105","display_name":"Southern University of Science and Technology","ror":"https://ror.org/049tv2d57","country_code":"CN","type":"education","lineage":["https://openalex.org/I3045169105"]}],"countries":["AU","CN"],"is_corresponding":false,"raw_author_name":"Yi Yang","raw_affiliation_strings":["Centre for Artificial Intelligence, University of Technology Sydney","SUSTech-UTS Joint Centre of CIS, Southern University of Science and Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Artificial Intelligence, University of Technology Sydney","institution_ids":["https://openalex.org/I114017466"]},{"raw_affiliation_string":"SUSTech-UTS Joint Centre of CIS, Southern University of Science and Technology","institution_ids":["https://openalex.org/I3045169105"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100395035"],"corresponding_institution_ids":["https://openalex.org/I114017466","https://openalex.org/I3045169105"],"apc_list":null,"apc_paid":null,"fwci":7.2748,"has_fulltext":true,"cited_by_count":89,"citation_normalized_percentile":{"value":0.97628042,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"2955","last_page":"2961"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.661182165145874},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.63592129945755},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6281416416168213},{"id":"https://openalex.org/keywords/stochastic-gradient-descent","display_name":"Stochastic gradient descent","score":0.5850863456726074},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5544756650924683},{"id":"https://openalex.org/keywords/momentum","display_name":"Momentum (technical analysis)","score":0.49917006492614746},{"id":"https://openalex.org/keywords/norm","display_name":"Norm (philosophy)","score":0.49674445390701294},{"id":"https://openalex.org/keywords/applied-mathematics","display_name":"Applied mathematics","score":0.48627936840057373},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4776657521724701},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.4646183252334595},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4472466707229614},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4449065029621124},{"id":"https://openalex.org/keywords/rate-of-convergence","display_name":"Rate of convergence","score":0.43402034044265747},{"id":"https://openalex.org/keywords/stochastic-optimization","display_name":"Stochastic optimization","score":0.43221038579940796},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3177228569984436},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.2844482660293579},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.07585600018501282},{"id":"https://openalex.org/keywords/mathematical-analysis","display_name":"Mathematical analysis","score":0.07389235496520996}],"concepts":[{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.661182165145874},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.63592129945755},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6281416416168213},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.5850863456726074},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5544756650924683},{"id":"https://openalex.org/C60718061","wikidata":"https://www.wikidata.org/wiki/Q1414747","display_name":"Momentum (technical analysis)","level":2,"score":0.49917006492614746},{"id":"https://openalex.org/C191795146","wikidata":"https://www.wikidata.org/wiki/Q3878446","display_name":"Norm (philosophy)","level":2,"score":0.49674445390701294},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.48627936840057373},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4776657521724701},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.4646183252334595},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4472466707229614},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4449065029621124},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.43402034044265747},{"id":"https://openalex.org/C194387892","wikidata":"https://www.wikidata.org/wiki/Q1747770","display_name":"Stochastic optimization","level":2,"score":0.43221038579940796},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3177228569984436},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2844482660293579},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.07585600018501282},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.07389235496520996},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C10138342","wikidata":"https://www.wikidata.org/wiki/Q43015","display_name":"Finance","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C50522688","wikidata":"https://www.wikidata.org/wiki/Q189833","display_name":"Economic growth","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.24963/ijcai.2018/410","is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2018/410","pdf_url":"https://www.ijcai.org/proceedings/2018/0410.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},{"id":"pmh:oai:opus.lib.uts.edu.au:10453/131496","is_oa":false,"landing_page_url":"http://hdl.handle.net/10453/131496","pdf_url":null,"source":{"id":"https://openalex.org/S4306401357","display_name":"UTS ePRESS (University of Technology Sydney)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I114017466","host_organization_name":"University of Technology Sydney","host_organization_lineage":["https://openalex.org/I114017466"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference Proceeding"}],"best_oa_location":{"id":"doi:10.24963/ijcai.2018/410","is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2018/410","pdf_url":"https://www.ijcai.org/proceedings/2018/0410.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7902954316","display_name":null,"funder_award_id":"1545995","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2808402975.pdf","grobid_xml":"https://content.openalex.org/works/W2808402975.grobid-xml"},"referenced_works_count":22,"referenced_works":["https://openalex.org/W104184427","https://openalex.org/W1811750039","https://openalex.org/W1984481261","https://openalex.org/W1987083649","https://openalex.org/W1988720110","https://openalex.org/W2000462146","https://openalex.org/W2054959047","https://openalex.org/W2079705627","https://openalex.org/W2105828468","https://openalex.org/W2109339818","https://openalex.org/W2137731592","https://openalex.org/W2160815625","https://openalex.org/W2163605009","https://openalex.org/W2301983558","https://openalex.org/W2304667012","https://openalex.org/W2415274228","https://openalex.org/W2963470657","https://openalex.org/W3141595720","https://openalex.org/W4236574868","https://openalex.org/W4294622503","https://openalex.org/W4294815180","https://openalex.org/W6715800825"],"related_works":["https://openalex.org/W3007093918","https://openalex.org/W4286899070","https://openalex.org/W4308706819","https://openalex.org/W4323366756","https://openalex.org/W2913271688","https://openalex.org/W4234593354","https://openalex.org/W2893302333","https://openalex.org/W4285259204","https://openalex.org/W4389775782","https://openalex.org/W3207830353"],"abstract_inverted_index":{"Stochastic":[0],"momentum":[1,52,134],"methods":[2,53],"have":[3,122],"been":[4],"widely":[5],"adopted":[6],"in":[7],"training":[8,20,115],"deep":[9,166],"neural":[10],"networks.":[11],"However,":[12,127],"their":[13],"theoretical":[14,151],"analysis":[15,112,130,164],"of":[16,18,68,89,92,113,140],"convergence":[17,87,111],"the":[19,23,36,44,50,59,65,80,86,90,95,101,105,110,114,128,133,138,141,147,154],"objective":[21,116],"and":[22,40,49,64,99,120,144,157],"generalization":[24,102,148],"error":[25],"for":[26,94],"prediction":[27],"is":[28],"still":[29],"under-explored.":[30],"This":[31],"paper":[32],"aims":[33],"to":[34],"bridge":[35],"gap":[37],"between":[38],"practice":[39],"theory":[41],"by":[42,161],"analyzing":[43],"stochastic":[45,51,60,66],"gradient":[46,71,93],"(SG)":[47],"method,":[48],"including":[54],"two":[55],"famous":[56],"variants,":[57],"i.e.,":[58],"heavy-ball":[61],"(SHB)":[62],"method":[63],"variant":[67],"Nesterov?s":[69],"accelerated":[70],"(SNAG)":[72],"method.":[73],"We":[74,83],"propose":[75],"a":[76],"framework":[77],"that":[78,118,132],"unifies":[79],"three":[81],"variants.":[82],"then":[84],"derive":[85],"rates":[88],"norm":[91],"non-convex":[96],"optimization":[97],"problem,":[98],"analyze":[100],"performance":[103],"through":[104],"uniform":[106],"stability":[107,129,139],"approach.":[108],"Particularly,":[109],"exhibits":[117],"SHB":[119],"SNAG":[121],"no":[123],"advantage":[124],"over":[125],"SG.":[126],"shows":[131],"term":[135],"can":[136],"improve":[137,146],"learned":[142],"model":[143],"hence":[145],"performance.":[149],"These":[150],"insights":[152],"verify":[153],"common":[155],"wisdom":[156],"are":[158],"also":[159],"corroborated":[160],"our":[162],"empirical":[163],"on":[165],"learning.":[167]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":13},{"year":2022,"cited_by_count":13},{"year":2021,"cited_by_count":17},{"year":2020,"cited_by_count":15},{"year":2019,"cited_by_count":11}],"updated_date":"2026-05-26T13:28:51.108037","created_date":"2025-10-10T00:00:00"}
