{"id":"https://openalex.org/W3012121966","doi":"https://doi.org/10.1109/iiswc47752.2019.9042047","title":"Characterizing Deep Learning Training Workloads on Alibaba-PAI","display_name":"Characterizing Deep Learning Training Workloads on Alibaba-PAI","publication_year":2019,"publication_date":"2019-11-01","ids":{"openalex":"https://openalex.org/W3012121966","doi":"https://doi.org/10.1109/iiswc47752.2019.9042047","mag":"3012121966"},"language":"en","primary_location":{"id":"doi:10.1109/iiswc47752.2019.9042047","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iiswc47752.2019.9042047","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Symposium on Workload Characterization (IISWC)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100707460","display_name":"Mengdi Wang","orcid":"https://orcid.org/0000-0002-2101-9507"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Mengdi Wang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100357843","display_name":"Meng Chen","orcid":"https://orcid.org/0000-0001-5476-8662"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chen Meng","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103437255","display_name":"Guoping Long","orcid":"https://orcid.org/0009-0006-3176-7572"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Guoping Long","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012597518","display_name":"Chuan Wu","orcid":"https://orcid.org/0000-0002-3144-4398"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chuan Wu","raw_affiliation_strings":["The University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"The University of Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100695453","display_name":"Jun Yang","orcid":"https://orcid.org/0000-0002-3365-4989"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jun Yang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100636012","display_name":"Wei Lin","orcid":"https://orcid.org/0000-0002-3003-0150"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wei Lin","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110220840","display_name":"Yangqing Jia","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yangqing Jia","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100707460"],"corresponding_institution_ids":["https://openalex.org/I4210095624"],"apc_list":null,"apc_paid":null,"fwci":2.35148666,"has_fulltext":false,"cited_by_count":46,"citation_normalized_percentile":{"value":0.90866705,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"189","last_page":"202"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8873436450958252},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.7753366231918335},{"id":"https://openalex.org/keywords/porting","display_name":"Porting","score":0.7659517526626587},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6400120258331299},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5941070318222046},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5903675556182861},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5709608197212219},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.44953304529190063},{"id":"https://openalex.org/keywords/ethernet","display_name":"Ethernet","score":0.43083637952804565},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4277365803718567},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4242074489593506},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4035044312477112},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.38890743255615234},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.3389343023300171},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.27845126390457153},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.212968647480011},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08997482061386108}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8873436450958252},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.7753366231918335},{"id":"https://openalex.org/C106251023","wikidata":"https://www.wikidata.org/wiki/Q851989","display_name":"Porting","level":3,"score":0.7659517526626587},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6400120258331299},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5941070318222046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5903675556182861},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5709608197212219},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.44953304529190063},{"id":"https://openalex.org/C172173386","wikidata":"https://www.wikidata.org/wiki/Q79984","display_name":"Ethernet","level":2,"score":0.43083637952804565},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4277365803718567},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4242074489593506},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4035044312477112},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.38890743255615234},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3389343023300171},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.27845126390457153},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.212968647480011},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08997482061386108}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iiswc47752.2019.9042047","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iiswc47752.2019.9042047","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Symposium on Workload Characterization (IISWC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":79,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1979527452","https://openalex.org/W2016053056","https://openalex.org/W2039127023","https://openalex.org/W2060393849","https://openalex.org/W2083842231","https://openalex.org/W2119112357","https://openalex.org/W2133564696","https://openalex.org/W2143612262","https://openalex.org/W2155893237","https://openalex.org/W2156303437","https://openalex.org/W2163605009","https://openalex.org/W2181607856","https://openalex.org/W2186615578","https://openalex.org/W2257979135","https://openalex.org/W2302255633","https://openalex.org/W2309679942","https://openalex.org/W2402144811","https://openalex.org/W2407521645","https://openalex.org/W2512971201","https://openalex.org/W2515080096","https://openalex.org/W2523246573","https://openalex.org/W2528388179","https://openalex.org/W2553303224","https://openalex.org/W2606833507","https://openalex.org/W2734781747","https://openalex.org/W2752512710","https://openalex.org/W2763421725","https://openalex.org/W2764100055","https://openalex.org/W2777078856","https://openalex.org/W2786095067","https://openalex.org/W2794670651","https://openalex.org/W2803058261","https://openalex.org/W2807021761","https://openalex.org/W2809420642","https://openalex.org/W2896457183","https://openalex.org/W2899771611","https://openalex.org/W2900167092","https://openalex.org/W2901681686","https://openalex.org/W2901839763","https://openalex.org/W2903557836","https://openalex.org/W2903901007","https://openalex.org/W2924425593","https://openalex.org/W2950800384","https://openalex.org/W2951947775","https://openalex.org/W2953384591","https://openalex.org/W2962758826","https://openalex.org/W2963341956","https://openalex.org/W2963350579","https://openalex.org/W2963403868","https://openalex.org/W2963596039","https://openalex.org/W2963601856","https://openalex.org/W2963674387","https://openalex.org/W2964308564","https://openalex.org/W2964324519","https://openalex.org/W2964330541","https://openalex.org/W3004495293","https://openalex.org/W3037875189","https://openalex.org/W3100848837","https://openalex.org/W3102254513","https://openalex.org/W3105753409","https://openalex.org/W3124352525","https://openalex.org/W4289276774","https://openalex.org/W4289300273","https://openalex.org/W4302086900","https://openalex.org/W4385245566","https://openalex.org/W6623517193","https://openalex.org/W6665801690","https://openalex.org/W6682864246","https://openalex.org/W6684191040","https://openalex.org/W6686045668","https://openalex.org/W6697698479","https://openalex.org/W6698183232","https://openalex.org/W6713134421","https://openalex.org/W6714138976","https://openalex.org/W6739693220","https://openalex.org/W6739901393","https://openalex.org/W6747195836","https://openalex.org/W6752199355"],"related_works":["https://openalex.org/W2356602486","https://openalex.org/W2058965144","https://openalex.org/W2351992668","https://openalex.org/W2324828474","https://openalex.org/W2164382479","https://openalex.org/W2374315191","https://openalex.org/W2391207559","https://openalex.org/W2891987081","https://openalex.org/W1662038552","https://openalex.org/W3021568819"],"abstract_inverted_index":{"Modern":[0],"deep":[1,81],"learning":[2,82],"models":[3,28],"have":[4],"been":[5],"exploited":[6],"in":[7,91],"various":[8,105,167],"domains,":[9],"including":[10],"computer":[11],"vision":[12],"(CV),":[13],"natural":[14],"language":[15],"processing":[16],"(NLP),":[17],"search":[18],"and":[19,39,55,62,73,143,171,178,206],"recommendation.":[20],"In":[21,76],"practical":[22,47],"AI":[23,48],"clusters,":[24],"workloads":[25,84,106,133,165,187],"training":[26,66,83,109,121],"these":[27,60],"are":[29,146],"run":[30],"using":[31,107],"software":[32,71,175],"frameworks":[33],"such":[34],"as":[35],"TensorFlow,":[36],"Caffe,":[37],"PyTorch":[38],"CNTK.":[40],"One":[41],"critical":[42],"issue":[43],"for":[44,203],"efficiently":[45],"operating":[46],"clouds,":[49],"is":[50,217],"to":[51,98,111,195,222],"characterize":[52,80],"the":[53,65,69,126,148,156,164,196,200],"computing":[54,142],"data":[56],"transfer":[57],"demands":[58],"of":[59,87,104,125,155,163,185],"workloads,":[61],"more":[63],"importantly,":[64],"performance":[67,113,162],"given":[68],"underlying":[70],"framework":[72,97],"hardware":[74,179],"configurations.":[75,180],"this":[77],"paper,":[78],"we":[79],"from":[85,219],"Platform":[86],"Artificial":[88],"Intelligence":[89],"(PAI)":[90],"Alibaba.":[92],"We":[93,158,181],"establish":[94],"an":[95],"analytical":[96],"investigate":[99],"detailed":[100],"execution":[101,128],"time":[102,129],"breakdown":[103],"different":[108],"architectures,":[110],"identify":[112,182],"bottleneck.":[114],"Results":[115],"show":[116],"that":[117,183],"weight/gradient":[118],"communication":[119],"during":[120],"takes":[122],"almost":[123],"62%":[124],"total":[127],"among":[130],"all":[131],"our":[132],"on":[134,152,166,174,207],"average.":[135],"The":[136],"computation":[137],"part,":[138],"involving":[139],"both":[140],"GPU":[141,204],"memory":[144],"access,":[145],"not":[147],"biggest":[149],"bottleneck":[150],"based":[151],"collective":[153],"behavior":[154],"workloads.":[157],"further":[159],"evaluate":[160],"attainable":[161],"potential":[168],"software/hardware":[169],"mappings,":[170],"explore":[172],"implications":[173],"architecture":[176,198],"selection":[177],"60%":[184],"PS/Worker":[186],"can":[188,211],"be":[189,212],"potentially":[190],"sped":[191],"up":[192],"when":[193,214],"ported":[194],"AllReduce":[197],"exploiting":[199],"high-speed":[201],"NVLink":[202],"interconnect,":[205],"average":[208],"1.7X":[209],"speedup":[210],"achieved":[213],"Ethernet":[215],"bandwidth":[216],"upgraded":[218],"25":[220],"Gbps":[221],"100":[223],"Gbps.":[224]},"counts_by_year":[{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":8},{"year":2022,"cited_by_count":9},{"year":2021,"cited_by_count":7},{"year":2020,"cited_by_count":6}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
