{"id":"https://openalex.org/W4292749535","doi":"https://doi.org/10.3233/jcm-226215","title":"Parameter rollback averaged stochastic gradient descent for language model","display_name":"Parameter rollback averaged stochastic gradient descent for language model","publication_year":2022,"publication_date":"2022-08-23","ids":{"openalex":"https://openalex.org/W4292749535","doi":"https://doi.org/10.3233/jcm-226215"},"language":"en","primary_location":{"id":"doi:10.3233/jcm-226215","is_oa":false,"landing_page_url":"https://doi.org/10.3233/jcm-226215","pdf_url":null,"source":{"id":"https://openalex.org/S2765058733","display_name":"Journal of Computational Methods in Sciences and Engineering","issn_l":"1472-7978","issn":["1472-7978","1875-8983"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318577","host_organization_name":"IOS Press","host_organization_lineage":["https://openalex.org/P4310318577"],"host_organization_lineage_names":["IOS Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Computational Methods in Sciences and Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhao Cheng","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153482","display_name":"Changzhou University","ror":"https://ror.org/04ymgwq66","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210153482"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhao Cheng","raw_affiliation_strings":["School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China","School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China","School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China; School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China","institution_ids":["https://openalex.org/I76130692"]},{"raw_affiliation_string":"School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China","institution_ids":["https://openalex.org/I4210153482"]},{"raw_affiliation_string":"School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China; School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China","institution_ids":["https://openalex.org/I4210153482","https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102970671","display_name":"Guanlin Chen","orcid":"https://orcid.org/0009-0004-4735-0582"},"institutions":[{"id":"https://openalex.org/I4210153482","display_name":"Changzhou University","ror":"https://ror.org/04ymgwq66","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210153482"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Guanlin Chen","raw_affiliation_strings":["School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China","School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China","School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China; School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China","institution_ids":["https://openalex.org/I76130692"]},{"raw_affiliation_string":"School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China","institution_ids":["https://openalex.org/I4210153482"]},{"raw_affiliation_string":"School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China; School of Computing Science and Artificial Intelligence, Changzhou University, Changzhou, Jiangsu, China","institution_ids":["https://openalex.org/I4210153482","https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104164017","display_name":"Wenyong Weng","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenyong Weng","raw_affiliation_strings":["School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101958491","display_name":"Qi Lu","orcid":"https://orcid.org/0000-0003-1741-9281"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi Lu","raw_affiliation_strings":["China National Air Separation Engineering Co., Ltd, Hangzhou, Zhejiang, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"China National Air Separation Engineering Co., Ltd, Hangzhou, Zhejiang, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065976782","display_name":"Wujian Yang","orcid":"https://orcid.org/0000-0002-9107-6849"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wujian Yang","raw_affiliation_strings":["School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer and Computing Science, Zhejiang University City College, Hangzhou, Zhejiang, China","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5102970671"],"corresponding_institution_ids":["https://openalex.org/I4210153482","https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.1387,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54491644,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"22","issue":"6","first_page":"2375","last_page":"2385"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rollback","display_name":"Rollback","score":0.944298267364502},{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.866396427154541},{"id":"https://openalex.org/keywords/treebank","display_name":"Treebank","score":0.8483263254165649},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7506561279296875},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.6138432025909424},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.6119270324707031},{"id":"https://openalex.org/keywords/dropout","display_name":"Dropout (neural networks)","score":0.5736809372901917},{"id":"https://openalex.org/keywords/stochastic-gradient-descent","display_name":"Stochastic gradient descent","score":0.5705463290214539},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4987938404083252},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.4735673666000366},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4422680735588074},{"id":"https://openalex.org/keywords/retraining","display_name":"Retraining","score":0.4277935326099396},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3433157205581665},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.21605965495109558},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.15876713395118713},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1364378035068512},{"id":"https://openalex.org/keywords/database-transaction","display_name":"Database transaction","score":0.0925634503364563},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.0815613865852356}],"concepts":[{"id":"https://openalex.org/C174220543","wikidata":"https://www.wikidata.org/wiki/Q395307","display_name":"Rollback","level":3,"score":0.944298267364502},{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.866396427154541},{"id":"https://openalex.org/C206134035","wikidata":"https://www.wikidata.org/wiki/Q811525","display_name":"Treebank","level":3,"score":0.8483263254165649},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7506561279296875},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.6138432025909424},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.6119270324707031},{"id":"https://openalex.org/C2776145597","wikidata":"https://www.wikidata.org/wiki/Q25339462","display_name":"Dropout (neural networks)","level":2,"score":0.5736809372901917},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.5705463290214539},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4987938404083252},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.4735673666000366},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4422680735588074},{"id":"https://openalex.org/C2778712577","wikidata":"https://www.wikidata.org/wiki/Q3505966","display_name":"Retraining","level":2,"score":0.4277935326099396},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3433157205581665},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.21605965495109558},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.15876713395118713},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1364378035068512},{"id":"https://openalex.org/C75949130","wikidata":"https://www.wikidata.org/wiki/Q848010","display_name":"Database transaction","level":2,"score":0.0925634503364563},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0815613865852356},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C155202549","wikidata":"https://www.wikidata.org/wiki/Q178803","display_name":"International trade","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/jcm-226215","is_oa":false,"landing_page_url":"https://doi.org/10.3233/jcm-226215","pdf_url":null,"source":{"id":"https://openalex.org/S2765058733","display_name":"Journal of Computational Methods in Sciences and Engineering","issn_l":"1472-7978","issn":["1472-7978","1875-8983"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310318577","host_organization_name":"IOS Press","host_organization_lineage":["https://openalex.org/P4310318577"],"host_organization_lineage_names":["IOS Press"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Computational Methods in Sciences and Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.699999988079071,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1924770834","https://openalex.org/W2064675550","https://openalex.org/W2086161653","https://openalex.org/W2091812280","https://openalex.org/W2146502635","https://openalex.org/W2293185259","https://openalex.org/W2605488176","https://openalex.org/W2946232455","https://openalex.org/W2963338481","https://openalex.org/W2963433607","https://openalex.org/W2963537482","https://openalex.org/W2964121744","https://openalex.org/W2998704965","https://openalex.org/W2998811572","https://openalex.org/W4230906748","https://openalex.org/W4288333985","https://openalex.org/W4299838440","https://openalex.org/W6680532216","https://openalex.org/W6681435938","https://openalex.org/W6742632731"],"related_works":["https://openalex.org/W2376415519","https://openalex.org/W3142119062","https://openalex.org/W131522978","https://openalex.org/W4301800915","https://openalex.org/W2739034105","https://openalex.org/W2963839582","https://openalex.org/W2964047924","https://openalex.org/W2743945814","https://openalex.org/W4299838440","https://openalex.org/W2962832505"],"abstract_inverted_index":{"Recently,":[0],"AWD-LSTM":[1,15,38,112],"(ASGD":[2],"Weight-Dropped":[3],"LSTM)":[4],"has":[5],"achieved":[6],"good":[7],"result":[8],"in":[9,23,88],"the":[10,41,46,76,85],"language":[11,27,71],"model,":[12],"and":[13,114],"many":[14],"based":[16,110,116],"models":[17,28],"have":[18,29],"obtained":[19],"state-of-the-art":[20],"perplexities.":[21],"However,":[22],"fact,":[24],"large-scale":[25],"neural":[26,70],"been":[30],"shown":[31],"to":[32,35,44,52,93],"be":[33],"prone":[34],"overfitting.":[36],"In":[37,57],"original":[39],"paper,":[40,59],"author":[42],"decided":[43],"adopt":[45],"way":[47],"of":[48,121],"retraining":[49],"calling":[50],"finetune":[51],"get":[53],"a":[54,62,94],"better":[55,102],"result.":[56],"this":[58,98],"we":[60,74,100],"present":[61],"simple":[63],"yet":[64],"effective":[65],"parameter":[66,77,86],"rollback":[67,78],"mechanism":[68],"for":[69],"models.":[72],"And":[73],"introduce":[75],"averaged":[79],"stochastic":[80],"gradient":[81],"descent":[82],"(PR-ASGD),":[83],"wherein":[84],"\u201cstep\u201d":[87],"ASGD":[89],"will":[90],"decrease":[91],"according":[92],"certain":[95],"probability.":[96],"Using":[97],"strategy,":[99],"achieve":[101],"word":[103],"level":[104],"perplexities":[105],"on":[106,111,117],"Penn":[107],"Treebank:":[108],"56.26":[109],"model":[113],"53.57":[115],"AWD-LSTM-MoS":[118],"(AWD-LSTM":[119],"Mixture":[120],"Softmaxes)":[122],"model.":[123]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
