{"id":"https://openalex.org/W4226322717","doi":"https://doi.org/10.1145/3505711.3505715","title":"On Large-Batch Training of Residual Networks with SignSGD","display_name":"On Large-Batch Training of Residual Networks with SignSGD","publication_year":2021,"publication_date":"2021-11-20","ids":{"openalex":"https://openalex.org/W4226322717","doi":"https://doi.org/10.1145/3505711.3505715"},"language":"en","primary_location":{"id":"doi:10.1145/3505711.3505715","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3505711.3505715","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 The 5th International Conference on Advances in Artificial Intelligence (ICAAI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047184449","display_name":"Alex Xavier","orcid":"https://orcid.org/0000-0003-4094-0681"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":true,"raw_author_name":"Alex Xavier","raw_affiliation_strings":["CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka and Computer Science and Engineering, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka and Computer Science and Engineering, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055449884","display_name":"Dumindu Tissera","orcid":"https://orcid.org/0000-0002-7461-0165"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Dumindu Tissera","raw_affiliation_strings":["CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030666586","display_name":"Rukshan Wijesinghe","orcid":"https://orcid.org/0000-0002-4003-6337"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Rukshan Wijesinghe","raw_affiliation_strings":["CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020016899","display_name":"Kasun Vithanage","orcid":"https://orcid.org/0000-0003-2798-1876"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Kasun Vithanage","raw_affiliation_strings":["CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058175691","display_name":"Ranga Rodrigo","orcid":"https://orcid.org/0000-0002-1034-7513"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Ranga Rodrigo","raw_affiliation_strings":["CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085751708","display_name":"Subha Fernando","orcid":"https://orcid.org/0000-0002-2621-5291"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Subha Fernando","raw_affiliation_strings":["CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"CODEGEN QBiTS Lab, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046054771","display_name":"Sanath Jayasena","orcid":"https://orcid.org/0000-0001-5097-8769"},"institutions":[{"id":"https://openalex.org/I195740183","display_name":"University of Moratuwa","ror":"https://ror.org/0491f5305","country_code":"LK","type":"education","lineage":["https://openalex.org/I195740183"]}],"countries":["LK"],"is_corresponding":false,"raw_author_name":"Sanath Jayasena","raw_affiliation_strings":["Computer Science and Engineering, University of Moratuwa,Sri Lanka, Sri Lanka"],"affiliations":[{"raw_affiliation_string":"Computer Science and Engineering, University of Moratuwa,Sri Lanka, Sri Lanka","institution_ids":["https://openalex.org/I195740183"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5047184449"],"corresponding_institution_ids":["https://openalex.org/I195740183"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.19232026,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"23","last_page":"30"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stochastic-gradient-descent","display_name":"Stochastic gradient descent","score":0.7943199872970581},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.7450630068778992},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7076489925384521},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.6647979021072388},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.642957866191864},{"id":"https://openalex.org/keywords/momentum","display_name":"Momentum (technical analysis)","score":0.6195757985115051},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.6054637432098389},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.6015101075172424},{"id":"https://openalex.org/keywords/traverse","display_name":"Traverse","score":0.581427276134491},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5292783379554749},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5115314722061157},{"id":"https://openalex.org/keywords/sign","display_name":"Sign (mathematics)","score":0.49334779381752014},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.4802118241786957},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.47641026973724365},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.44494229555130005},{"id":"https://openalex.org/keywords/residual-neural-network","display_name":"Residual neural network","score":0.421818345785141},{"id":"https://openalex.org/keywords/limit","display_name":"Limit (mathematics)","score":0.41590234637260437},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3652384281158447},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.2490222156047821},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.06315702199935913}],"concepts":[{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.7943199872970581},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.7450630068778992},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7076489925384521},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.6647979021072388},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.642957866191864},{"id":"https://openalex.org/C60718061","wikidata":"https://www.wikidata.org/wiki/Q1414747","display_name":"Momentum (technical analysis)","level":2,"score":0.6195757985115051},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.6054637432098389},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.6015101075172424},{"id":"https://openalex.org/C176809094","wikidata":"https://www.wikidata.org/wiki/Q15401496","display_name":"Traverse","level":2,"score":0.581427276134491},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5292783379554749},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5115314722061157},{"id":"https://openalex.org/C139676723","wikidata":"https://www.wikidata.org/wiki/Q1193832","display_name":"Sign (mathematics)","level":2,"score":0.49334779381752014},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.4802118241786957},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.47641026973724365},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.44494229555130005},{"id":"https://openalex.org/C2944601119","wikidata":"https://www.wikidata.org/wiki/Q43744058","display_name":"Residual neural network","level":3,"score":0.421818345785141},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.41590234637260437},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3652384281158447},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2490222156047821},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.06315702199935913},{"id":"https://openalex.org/C10138342","wikidata":"https://www.wikidata.org/wiki/Q43015","display_name":"Finance","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C50522688","wikidata":"https://www.wikidata.org/wiki/Q189833","display_name":"Economic growth","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C121955636","wikidata":"https://www.wikidata.org/wiki/Q4116214","display_name":"Accounting","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3505711.3505715","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3505711.3505715","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 The 5th International Conference on Advances in Artificial Intelligence (ICAAI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W569478347","https://openalex.org/W2143908786","https://openalex.org/W2194775991","https://openalex.org/W2407022425","https://openalex.org/W2786602455","https://openalex.org/W2963433607","https://openalex.org/W4240768087"],"related_works":["https://openalex.org/W4206903459","https://openalex.org/W2754816816","https://openalex.org/W4366280654","https://openalex.org/W3160167280","https://openalex.org/W4231621013","https://openalex.org/W4362706668","https://openalex.org/W3008318776","https://openalex.org/W2041416246","https://openalex.org/W3020853991","https://openalex.org/W3035836947"],"abstract_inverted_index":{"Large-batch":[0],"training":[1,31,70],"of":[2,25,104,109,143,162],"deep":[3],"neural":[4,111],"networks":[5],"(DNN)":[6],"has":[7,28],"recently":[8],"been":[9],"widely":[10],"studied,":[11],"since":[12],"traversing":[13],"the":[14,23,48,84,110,141,166],"optimization":[15],"landscape":[16],"is":[17,36,44,63],"faster":[18],"with":[19,52,71,133,151,158],"large":[20,53,87],"batches":[21,88],"and":[22,155],"emergence":[24],"parallel":[26],"computing":[27],"made":[29],"large-batch":[30,68],"feasible.":[32],"However,":[33],"its":[34,91,115,119],"generalization":[35,73,92],"still":[37],"worse":[38],"compared":[39,74],"to":[40,47,75,82],"small-batch":[41],"training,":[42],"which":[43,99],"commonly":[45],"attributed":[46],"low":[49],"gradient":[50,60,116,120],"variance":[51],"batches.":[54],"We":[55,78,136],"show":[56,137],"that":[57,138],"sign":[58,117],"stochastic":[59],"descent":[61],"(signSGD)":[62],"a":[64,107],"suitable":[65],"candidate":[66],"for":[67,86,131],"ResNet":[69],"improved":[72],"vanilla":[76,153],"SGD.":[77],"further":[79],"modify":[80],"signSGD":[81,129,144,150],"improve":[83],"convergence":[85,142],"while":[89,145],"retaining":[90],"properties.":[93],"In":[94],"particular,":[95],"we":[96],"propose":[97],"r-signSGD,":[98],"restricts":[100],"certain":[101],"parameter":[102,108],"updates":[103,106,126],"signSGD\u2014r-signSGD":[105],"network":[112],"only":[113],"if":[114],"matches":[118],"momentum":[121,159],"sign.":[122],"This":[123],"prevents":[124],"unnecessary/wrong":[125],"given":[127],"by":[128],"especially":[130],"parameters":[132],"small":[134],"gradients.":[135],"r-signSGD":[139],"improves":[140],"also":[146],"performing":[147],"better":[148],"than":[149],"momentum,":[152],"SGD,":[154],"even":[156],"SGD":[157],"in":[160,165],"terms":[161],"test":[163],"accuracy":[164],"CIFAR10":[167],"image":[168],"classification":[169],"task.":[170]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
