{"id":"https://openalex.org/W2407022425","doi":"https://doi.org/10.21437/interspeech.2014-274","title":"1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs","display_name":"1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs","publication_year":2014,"publication_date":"2014-09-14","ids":{"openalex":"https://openalex.org/W2407022425","doi":"https://doi.org/10.21437/interspeech.2014-274","mag":"2407022425"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2014-274","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2014-274","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2014","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072932051","display_name":"Frank Seide","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Frank Seide","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101611862","display_name":"Hao Fu","orcid":"https://orcid.org/0000-0002-5199-862X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hao Fu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012153296","display_name":"Jasha Droppo","orcid":"https://orcid.org/0000-0001-6097-0090"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jasha Droppo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438769","display_name":"Gang Li","orcid":"https://orcid.org/0000-0003-1583-641X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gang Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034476404","display_name":"Dong Yu","orcid":"https://orcid.org/0000-0003-0520-6844"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong Yu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5072932051"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":36.4929,"has_fulltext":false,"cited_by_count":882,"citation_normalized_percentile":{"value":0.99794124,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1058","last_page":"1062"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stochastic-gradient-descent","display_name":"Stochastic gradient descent","score":0.8034744262695312},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7861311435699463},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5682199597358704},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.5122796893119812},{"id":"https://openalex.org/keywords/bit","display_name":"Bit (key)","score":0.4985661506652832},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4605836272239685},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.31844088435173035},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.17362314462661743},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.15603893995285034}],"concepts":[{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.8034744262695312},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7861311435699463},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5682199597358704},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.5122796893119812},{"id":"https://openalex.org/C117011727","wikidata":"https://www.wikidata.org/wiki/Q1278488","display_name":"Bit (key)","level":2,"score":0.4985661506652832},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4605836272239685},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31844088435173035},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.17362314462661743},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.15603893995285034},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2014-274","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2014-274","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2014","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W196761320","https://openalex.org/W217970951","https://openalex.org/W1218987319","https://openalex.org/W1498436455","https://openalex.org/W2000200144","https://openalex.org/W2008353316","https://openalex.org/W2012257340","https://openalex.org/W2026369565","https://openalex.org/W2071310251","https://openalex.org/W2093794678","https://openalex.org/W2136922672","https://openalex.org/W2143908786","https://openalex.org/W2146502635","https://openalex.org/W2147768505","https://openalex.org/W2162390675","https://openalex.org/W2164278908","https://openalex.org/W2254556534","https://openalex.org/W2294543795","https://openalex.org/W2394932179","https://openalex.org/W2403040875","https://openalex.org/W2403195671","https://openalex.org/W2951781666"],"related_works":["https://openalex.org/W2895097035","https://openalex.org/W4206903459","https://openalex.org/W2754816816","https://openalex.org/W4366280654","https://openalex.org/W3160167280","https://openalex.org/W3020853991","https://openalex.org/W4362706668","https://openalex.org/W4231621013","https://openalex.org/W3171021120","https://openalex.org/W3008318776"],"abstract_inverted_index":{"We":[0,57],"show":[1],"empirically":[2],"that":[3],"in":[4,142,153],"SGD":[5,48,62],"training":[6,135,148],"of":[7,19,98,129,139,151],"deep":[8],"neural":[9],"networks,":[10],"one":[11,26],"can,":[12],"at":[13,163],"no":[14,17],"or":[15],"nearly":[16],"loss":[18],"accuracy,":[20],"quantize":[21],"the":[22,30],"gradients":[23],"aggressively\u2014to":[24],"but":[25],"bit":[27],"per":[28,101,108],"value\u2014if":[29],"quantization":[31,78],"error":[32],"is":[33],"carried":[34],"forward":[35],"across":[36],"minibatches":[37],"(error":[38],"feedback).":[39],"This":[40,121],"size":[41],"reduction":[42],"makes":[43],"it":[44],"feasible":[45],"to":[46,123],"parallelize":[47],"through":[49],"data-parallelism":[50],"with":[51,67,91,112,117],"fast":[52],"processors":[53],"like":[54],"recent":[55],"GPUs.":[56,120],"implement":[58],"data-parallel":[59],"deterministically":[60],"distributed":[61],"by":[63],"combining":[64],"this":[65],"finding":[66],"AdaGrad,":[68,80],"automatic":[69],"minibatch-size":[70],"selection,":[71],"double":[72],"buffering,":[73],"and":[74,110,131],"model":[75,147],"parallelism.":[76],"Unexpectedly,":[77],"benefits":[79],"giving":[81],"a":[82,87,115,126,164],"small":[83,165],"accuracy":[84,166],"gain.":[85],"For":[86],"typical":[88],"Switchboard":[89],"DNN":[90],"46M":[92],"parameters,":[93],"we":[94],"reach":[95],"computation":[96],"speeds":[97],"27k":[99],"frames":[100],"second":[102],"(kfps)":[103],"when":[104],"using":[105],"2880":[106],"samples":[107],"minibatch,":[109],"51kfps":[111],"16k,":[113],"on":[114,156],"server":[116],"8":[118],"K20X":[119],"corresponds":[122],"speed-ups":[124],"over":[125,137],"single":[127],"GPU":[128],"3.6":[130],"6.3,":[132],"respectively.":[133],"7":[134],"passes":[136],"309h":[138],"data":[140,152],"complete":[141],"under":[143,154],"7h.":[144],"A":[145],"160M-parameter":[146],"processes":[149],"3300h":[150],"16h":[155],"20":[157],"dual-GPU":[158],"servers\u2014a":[159],"10":[160],"times":[161],"speed-up\u2014albeit":[162],"loss.":[167]},"counts_by_year":[{"year":2026,"cited_by_count":6},{"year":2025,"cited_by_count":48},{"year":2024,"cited_by_count":78},{"year":2023,"cited_by_count":83},{"year":2022,"cited_by_count":76},{"year":2021,"cited_by_count":194},{"year":2020,"cited_by_count":146},{"year":2019,"cited_by_count":96},{"year":2018,"cited_by_count":69},{"year":2017,"cited_by_count":39},{"year":2016,"cited_by_count":32},{"year":2015,"cited_by_count":13},{"year":2014,"cited_by_count":2}],"updated_date":"2026-03-28T08:17:26.163206","created_date":"2025-10-10T00:00:00"}
