{"id":"https://openalex.org/W4385299157","doi":"https://doi.org/10.1145/3603781.3603827","title":"LanYUAN, a GPT large model using Curriculum Learning and Sparse Attention","display_name":"LanYUAN, a GPT large model using Curriculum Learning and Sparse Attention","publication_year":2023,"publication_date":"2023-05-26","ids":{"openalex":"https://openalex.org/W4385299157","doi":"https://doi.org/10.1145/3603781.3603827"},"language":"en","primary_location":{"id":"doi:10.1145/3603781.3603827","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1145/3603781.3603827","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 4th International Conference on Computing, Networks and Internet of Things","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031339022","display_name":"Gonghai Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Gonghai Zhou","raw_affiliation_strings":["School of Information Science &amp; Engineering, Lanzhou University, China"],"raw_orcid":"https://orcid.org/0009-0005-5520-6000","affiliations":[{"raw_affiliation_string":"School of Information Science &amp; Engineering, Lanzhou University, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074106465","display_name":"Yuhong Zhang","orcid":"https://orcid.org/0000-0001-6180-4457"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuhong Zhang","raw_affiliation_strings":["School of Information Science &amp; Engineering, Lanzhou University, China","Supercomputing Center of Lanzhou University, Lanzhou University, China"],"raw_orcid":"https://orcid.org/0000-0001-6180-4457","affiliations":[{"raw_affiliation_string":"School of Information Science &amp; Engineering, Lanzhou University, China","institution_ids":["https://openalex.org/I76214153"]},{"raw_affiliation_string":"Supercomputing Center of Lanzhou University, Lanzhou University, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043064566","display_name":"Rizhen Hu","orcid":"https://orcid.org/0009-0000-3812-1252"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rizhen Hu","raw_affiliation_strings":["School of Information Science &amp; Engineering, Lanzhou University, China"],"raw_orcid":"https://orcid.org/0009-0000-3812-1252","affiliations":[{"raw_affiliation_string":"School of Information Science &amp; Engineering, Lanzhou University, China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101719861","display_name":"Yang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Zhang","raw_affiliation_strings":["School of Information Science &amp; Engineering, Lanzhou University, China","Supercomputing Center of Lanzhou University, Lanzhou University, China"],"raw_orcid":"https://orcid.org/0009-0002-5616-9780","affiliations":[{"raw_affiliation_string":"School of Information Science &amp; Engineering, Lanzhou University, China","institution_ids":["https://openalex.org/I76214153"]},{"raw_affiliation_string":"Supercomputing Center of Lanzhou University, Lanzhou University, China","institution_ids":["https://openalex.org/I76214153"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5031339022"],"corresponding_institution_ids":["https://openalex.org/I76214153"],"apc_list":null,"apc_paid":null,"fwci":0.1704,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54953633,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"265","last_page":"272"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9764000177383423,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8234957456588745},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.6651191711425781},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6540817022323608},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.5247285962104797},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.5182240009307861},{"id":"https://openalex.org/keywords/instruction-level-parallelism","display_name":"Instruction-level parallelism","score":0.4565829634666443},{"id":"https://openalex.org/keywords/curriculum","display_name":"Curriculum","score":0.4502449035644531},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44636955857276917},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.40841200947761536},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3256874680519104}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8234957456588745},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.6651191711425781},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6540817022323608},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.5247285962104797},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.5182240009307861},{"id":"https://openalex.org/C140763907","wikidata":"https://www.wikidata.org/wiki/Q2714055","display_name":"Instruction-level parallelism","level":3,"score":0.4565829634666443},{"id":"https://openalex.org/C47177190","wikidata":"https://www.wikidata.org/wiki/Q207137","display_name":"Curriculum","level":2,"score":0.4502449035644531},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44636955857276917},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.40841200947761536},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3256874680519104},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0},{"id":"https://openalex.org/C19417346","wikidata":"https://www.wikidata.org/wiki/Q7922","display_name":"Pedagogy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3603781.3603827","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1145/3603781.3603827","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 4th International Conference on Computing, Networks and Internet of Things","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1981276685","https://openalex.org/W2296073425","https://openalex.org/W2612690371","https://openalex.org/W2965373594","https://openalex.org/W3001279689","https://openalex.org/W3030163527","https://openalex.org/W3198659451","https://openalex.org/W4292779060","https://openalex.org/W6600013530","https://openalex.org/W6600737549","https://openalex.org/W6605905859","https://openalex.org/W6772383348"],"related_works":["https://openalex.org/W2950520577","https://openalex.org/W2003935582","https://openalex.org/W1991844655","https://openalex.org/W2105992728","https://openalex.org/W1229628","https://openalex.org/W4400951174","https://openalex.org/W74409296","https://openalex.org/W2009213655","https://openalex.org/W2593878938","https://openalex.org/W2494130044"],"abstract_inverted_index":{"In":[0],"2021,":[1],"the":[2,8,30,37,74,94,98,147,155,162,169,179],"Inspur":[3],"AI":[4,9,17],"Research":[5],"Institute":[6],"introduced":[7],"Megatron":[10,43],"Model":[11],"Yuan-1.0,":[12],"a":[13,128,139],"massive":[14],"Chinese":[15,33],"language":[16],"model":[18,24,38,46,83,90,112,171],"containing":[19],"245.7":[20],"billion":[21,164],"parameters.":[22],"This":[23],"surpassed":[25],"OpenAI's":[26],"GPT-3,":[27],"making":[28],"it":[29],"world's":[31],"largest":[32],"NLP":[34],"model.":[35,167],"Although":[36],"was":[39,150],"pre-trained":[40],"using":[41],"Nvidia's":[42,110],"framework":[44],"with":[45,97,109,122,178],"parallelism,":[47,49],"data":[48,107],"and":[50,65,79,86,103,132,154],"pipelining":[51,95],"optimizations,":[52],"there":[53],"is":[54],"still":[55],"room":[56],"for":[57],"improvement":[58,174],"in":[59,175],"terms":[60],"of":[61,76,135],"training":[62,84,148,181],"time,":[63],"cost,":[64],"convergence.":[66],"To":[67],"achieve":[68,115],"better":[69],"performance,":[70],"this":[71],"paper":[72],"investigates":[73],"impacts":[75],"batch":[77],"size":[78],"learning":[80],"rate":[81],"on":[82,118],"time":[85,149],"accuracy":[87],"to":[88,114,161],"balance":[89],"performance.":[91],"We":[92],"replaced":[93],"optimization":[96,141],"more":[99],"efficient":[100],"DeepSpeed":[101],"framework,":[102],"combined":[104],"DeepSpeed's":[105],"ZeRO-based":[106],"parallelism":[108,113],"Megatron-LM":[111],"higher":[116],"performance":[117,173],"Nvidia":[119],"GPU":[120],"clusters":[121],"high-bandwidth":[123],"interconnects.":[124],"Additionally,":[125],"we":[126],"used":[127],"curriculum":[129],"learning-based":[130],"method":[131],"four":[133],"types":[134],"sparse":[136],"attention":[137],"as":[138],"new":[140],"approaches.":[142],"The":[143],"results":[144],"showed":[145],"that":[146],"reduced":[151],"by":[152,158],"20%":[153,159],"throughput":[156],"increased":[157],"compared":[160],"47":[163],"parameters":[165],"Yuan-1.0":[166],"Approximately,":[168],"optimized":[170],"achieved":[172],"downstream":[176],"tasks":[177],"same":[180],"data.":[182]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-12-21T23:12:01.093139","created_date":"2025-10-10T00:00:00"}
