{"id":"https://openalex.org/W2885142640","doi":"https://doi.org/10.1109/cahpc.2018.8645935","title":"Large Scale Language Modeling: Converging on 40GB of Text in Four Hours","display_name":"Large Scale Language Modeling: Converging on 40GB of Text in Four Hours","publication_year":2018,"publication_date":"2018-09-01","ids":{"openalex":"https://openalex.org/W2885142640","doi":"https://doi.org/10.1109/cahpc.2018.8645935","mag":"2885142640"},"language":"en","primary_location":{"id":"doi:10.1109/cahpc.2018.8645935","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cahpc.2018.8645935","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 30th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041667198","display_name":"Raul Puri","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Raul Puri","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083523340","display_name":"Robert M. Kirby","orcid":"https://orcid.org/0000-0001-5712-4141"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Robert Kirby","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027112782","display_name":"Nikolai Yakovenko","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nikolai Yakovenko","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066242985","display_name":"Bryan Catanzaro","orcid":"https://orcid.org/0000-0003-0034-7728"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bryan Catanzaro","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5041667198"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.466,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.86558382,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"290","last_page":"297"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8304125070571899},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6732529401779175},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5638939738273621},{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.527043879032135},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4958079755306244},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.48328545689582825},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.461737722158432},{"id":"https://openalex.org/keywords/multiplicative-function","display_name":"Multiplicative function","score":0.45883578062057495},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.43921831250190735},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.42314907908439636},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.4147636592388153},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3759550154209137},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.07829636335372925},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.0772237479686737}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8304125070571899},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6732529401779175},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5638939738273621},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.527043879032135},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4958079755306244},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.48328545689582825},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.461737722158432},{"id":"https://openalex.org/C42747912","wikidata":"https://www.wikidata.org/wiki/Q1048447","display_name":"Multiplicative function","level":2,"score":0.45883578062057495},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43921831250190735},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.42314907908439636},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.4147636592388153},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3759550154209137},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.07829636335372925},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0772237479686737},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C78458016","wikidata":"https://www.wikidata.org/wiki/Q840400","display_name":"Evolutionary biology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cahpc.2018.8645935","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cahpc.2018.8645935","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 30th International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":92,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1539309091","https://openalex.org/W1566289585","https://openalex.org/W1686810756","https://openalex.org/W1815076433","https://openalex.org/W1951216520","https://openalex.org/W2027731328","https://openalex.org/W2062118960","https://openalex.org/W2101234009","https://openalex.org/W2108598243","https://openalex.org/W2117539524","https://openalex.org/W2153579005","https://openalex.org/W2155541015","https://openalex.org/W2158139315","https://openalex.org/W2170973209","https://openalex.org/W2194775991","https://openalex.org/W2250539671","https://openalex.org/W2252143850","https://openalex.org/W2284050935","https://openalex.org/W2338908902","https://openalex.org/W2507974895","https://openalex.org/W2555428947","https://openalex.org/W2606347107","https://openalex.org/W2606833507","https://openalex.org/W2611669587","https://openalex.org/W2617242334","https://openalex.org/W2622263826","https://openalex.org/W2749988060","https://openalex.org/W2759465730","https://openalex.org/W2763421725","https://openalex.org/W2766164908","https://openalex.org/W2769856846","https://openalex.org/W2775461895","https://openalex.org/W2784121710","https://openalex.org/W2794557536","https://openalex.org/W2804935296","https://openalex.org/W2806311723","https://openalex.org/W2899771611","https://openalex.org/W2950726992","https://openalex.org/W2952729433","https://openalex.org/W2962739339","https://openalex.org/W2962835968","https://openalex.org/W2963045354","https://openalex.org/W2963069632","https://openalex.org/W2963090765","https://openalex.org/W2963112338","https://openalex.org/W2963150697","https://openalex.org/W2963403868","https://openalex.org/W2963685250","https://openalex.org/W2963702144","https://openalex.org/W2963756346","https://openalex.org/W2963807318","https://openalex.org/W2964019776","https://openalex.org/W2964121744","https://openalex.org/W4239072543","https://openalex.org/W4294170691","https://openalex.org/W4294375521","https://openalex.org/W4299300709","https://openalex.org/W4300822525","https://openalex.org/W4302343710","https://openalex.org/W4385245566","https://openalex.org/W6629028937","https://openalex.org/W6632248436","https://openalex.org/W6637373629","https://openalex.org/W6638545294","https://openalex.org/W6640820311","https://openalex.org/W6675354045","https://openalex.org/W6677651945","https://openalex.org/W6682691769","https://openalex.org/W6682778277","https://openalex.org/W6683557909","https://openalex.org/W6685158001","https://openalex.org/W6687483927","https://openalex.org/W6695676441","https://openalex.org/W6703652217","https://openalex.org/W6727057065","https://openalex.org/W6729503108","https://openalex.org/W6736923733","https://openalex.org/W6739622702","https://openalex.org/W6739901393","https://openalex.org/W6742080785","https://openalex.org/W6743289643","https://openalex.org/W6744711479","https://openalex.org/W6745245109","https://openalex.org/W6745410505","https://openalex.org/W6745558287","https://openalex.org/W6746514494","https://openalex.org/W6746932923","https://openalex.org/W6747727460","https://openalex.org/W6747808108","https://openalex.org/W6748148878","https://openalex.org/W6749879876"],"related_works":["https://openalex.org/W4225394202","https://openalex.org/W4298287631","https://openalex.org/W2953061907","https://openalex.org/W1847088711","https://openalex.org/W3036642985","https://openalex.org/W3032952384","https://openalex.org/W4318195686","https://openalex.org/W4251867247","https://openalex.org/W3017902212","https://openalex.org/W2964335273"],"abstract_inverted_index":{"Recent":[0,131],"work":[1,104,132,215],"has":[2,133],"shown":[3],"how":[4],"to":[5,25,71,108,162,182,223],"train":[6,72,109],"Convolutional":[7],"Neural":[8,44],"Networks":[9,45],"(CNNs)":[10],"rapidly":[11],"on":[12],"large":[13,124,218],"image":[14],"datasets":[15,242],"[1],":[16],"then":[17],"transfer":[18,41],"the":[19,88,110,119,136,151,194],"knowledge":[20],"gained":[21],"from":[22],"these":[23],"models":[24,127],"a":[26,57,73,140,155,174,185],"variety":[27],"of":[28,87,142,157,204],"tasks":[29],"[2].":[30],"Following":[31],"[3],":[32],"in":[33,95,198],"this":[34,170,214],"work,":[35],"we":[36,68,146],"demonstrate":[37],"similar":[38],"scalability":[39],"and":[40,56,113,200,227],"for":[42,47,80,115,169],"Recurrent":[43],"(RNNs)":[46],"Natural":[48],"Language":[49],"tasks.":[50],"By":[51],"utilizing":[52],"mixed":[53],"precision":[54],"arithmetic":[55],"32k":[58,186],"batch":[59,125,143,158,187],"size":[60,112,159],"distributed":[61],"across":[62],"128":[63,205],"NVIDIA":[64],"Tesla":[65,206],"V100":[66,207],"GPUs,":[67,208],"are":[69],"able":[70],"character-level":[74],"4096-dimension":[75],"multiplicative":[76],"LSTM":[77],"(mLSTM)":[78],"[4]":[79],"unsupervised":[81,220],"text":[82,241],"reconstruction":[83],"over":[84,118,193,236],"3":[85],"epochs":[86],"40":[89],"GB":[90],"Amazon":[91,195],"Reviews":[92,196],"dataset":[93,121,197],"[5]":[94],"four":[96],"hours.":[97],"This":[98],"runtime":[99],"compares":[100],"favorably":[101],"with":[102,184],"previous":[103],"taking":[105],"one":[106,116],"month":[107],"same":[111,120],"configuration":[114],"epoch":[117],"[3].":[122],"Converging":[123],"RNN":[126],"can":[128,233],"be":[129,234],"challenging.":[130],"suggested":[134],"scaling":[135,150],"learning":[137,152,175,229],"rate":[138,153,176],"as":[139,154],"function":[141,156],"size,":[144],"but":[145],"find":[147],"that":[148,178],"simply":[149],"leads":[160],"either":[161],"significantly":[163],"worse":[164],"convergence":[165],"or":[166,239],"immediate":[167],"divergence":[168],"problem.":[171],"We":[172],"provide":[173],"schedule":[177],"allows":[179],"our":[180,190,201],"model":[181,191,232],"converge":[183],"size.":[188],"Since":[189],"converges":[192],"hours,":[199],"compute":[202],"requirement":[203],"while":[209],"substantial,":[210],"is":[211],"commercially":[212],"available,":[213],"opens":[216],"up":[217],"scale":[219],"NLP":[221],"training":[222],"most":[224,237],"commercial":[225],"applications":[226],"deep":[228],"researchers.":[230],"A":[231],"trained":[235],"public":[238],"private":[240],"overnight.":[243]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":3},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":5},{"year":2018,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
