{"id":"https://openalex.org/W4318328320","doi":"https://doi.org/10.1145/3559009.3569656","title":"Parallelizing Neural Network Models Effectively on GPU by Implementing Reductions Atomically","display_name":"Parallelizing Neural Network Models Effectively on GPU by Implementing Reductions Atomically","publication_year":2022,"publication_date":"2022-10-08","ids":{"openalex":"https://openalex.org/W4318328320","doi":"https://doi.org/10.1145/3559009.3569656"},"language":"en","primary_location":{"id":"doi:10.1145/3559009.3569656","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3559009.3569656","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3559009.3569656","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3559009.3569656","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000546902","display_name":"Jie Zhao","orcid":"https://orcid.org/0000-0003-2303-9736"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jie Zhao","raw_affiliation_strings":["State Key Laboratory of Mathematical Engineering and Advanced Computing, Zhengzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-2303-9736","affiliations":[{"raw_affiliation_string":"State Key Laboratory of Mathematical Engineering and Advanced Computing, Zhengzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024483254","display_name":"C\u00e9dric Bastoul","orcid":"https://orcid.org/0000-0002-7164-8213"},"institutions":[{"id":"https://openalex.org/I4210123571","display_name":"Huawei Technologies (France)","ror":"https://ror.org/02rbzf697","country_code":"FR","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210123571"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"C\u00e9dric Bastoul","raw_affiliation_strings":["Huawei Technologies France SASU, Paris, France"],"raw_orcid":"https://orcid.org/0000-0002-7164-8213","affiliations":[{"raw_affiliation_string":"Huawei Technologies France SASU, Paris, France","institution_ids":["https://openalex.org/I4210123571"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044802292","display_name":"Yanzhi Yi","orcid":"https://orcid.org/0000-0002-3486-3731"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanzhi Yi","raw_affiliation_strings":["Huawei Technologies Co., Ltd., Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-3486-3731","affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085610234","display_name":"Jiahui Hu","orcid":"https://orcid.org/0000-0002-4367-0464"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiahui Hu","raw_affiliation_strings":["Huawei Technologies Co., Ltd., Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-4367-0464","affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066629373","display_name":"Wang Nie","orcid":"https://orcid.org/0000-0001-9903-8217"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wang Nie","raw_affiliation_strings":["Huawei Technologies Co., Ltd., Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-9903-8217","affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101772672","display_name":"Renwei Zhang","orcid":"https://orcid.org/0000-0002-9744-5676"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Renwei Zhang","raw_affiliation_strings":["Huawei Technologies Co., Ltd., Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9744-5676","affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007511354","display_name":"Zhen Geng","orcid":"https://orcid.org/0000-0003-1031-6431"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhen Geng","raw_affiliation_strings":["Huawei Technologies Co., Ltd., Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-1031-6431","affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., Hangzhou, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051441371","display_name":"Chong Li","orcid":"https://orcid.org/0000-0002-4160-7170"},"institutions":[{"id":"https://openalex.org/I4210123571","display_name":"Huawei Technologies (France)","ror":"https://ror.org/02rbzf697","country_code":"FR","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210123571"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Chong Li","raw_affiliation_strings":["Huawei Technologies France SASU, Paris, France"],"raw_orcid":"https://orcid.org/0000-0002-4160-7170","affiliations":[{"raw_affiliation_string":"Huawei Technologies France SASU, Paris, France","institution_ids":["https://openalex.org/I4210123571"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080257325","display_name":"Thibaut Tachon","orcid":"https://orcid.org/0000-0003-3264-5535"},"institutions":[{"id":"https://openalex.org/I4210123571","display_name":"Huawei Technologies (France)","ror":"https://ror.org/02rbzf697","country_code":"FR","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210123571"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Thibaut Tachon","raw_affiliation_strings":["Huawei Technologies France SASU, Paris, France"],"raw_orcid":"https://orcid.org/0000-0003-3264-5535","affiliations":[{"raw_affiliation_string":"Huawei Technologies France SASU, Paris, France","institution_ids":["https://openalex.org/I4210123571"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5083785921","display_name":"Zhiliang Gan","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiliang Gan","raw_affiliation_strings":["Huawei Technologies Co., Ltd., Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0001-8983-4666","affiliations":[{"raw_affiliation_string":"Huawei Technologies Co., Ltd., Shenzhen, China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5000546902"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.2331,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.54548717,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"451","last_page":"466"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.881523072719574},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.8112013339996338},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7183486223220825},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.7002766132354736},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.688490092754364},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.584952712059021},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.5592049360275269},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5434617400169373},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.44602763652801514},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.29514485597610474},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.2492923140525818},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.22548124194145203}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.881523072719574},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.8112013339996338},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7183486223220825},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.7002766132354736},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.688490092754364},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.584952712059021},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.5592049360275269},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5434617400169373},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.44602763652801514},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29514485597610474},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2492923140525818},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.22548124194145203},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3559009.3569656","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3559009.3569656","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3559009.3569656","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3559009.3569656","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3559009.3569656","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3559009.3569656","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7619151073","display_name":null,"funder_award_id":"U20A20226","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W152682494","https://openalex.org/W1191262899","https://openalex.org/W1480958225","https://openalex.org/W1558370006","https://openalex.org/W1970141743","https://openalex.org/W1997744477","https://openalex.org/W2025278624","https://openalex.org/W2034761517","https://openalex.org/W2049510645","https://openalex.org/W2055312318","https://openalex.org/W2077143534","https://openalex.org/W2082554739","https://openalex.org/W2108315152","https://openalex.org/W2122532513","https://openalex.org/W2126464777","https://openalex.org/W2135736783","https://openalex.org/W2291192259","https://openalex.org/W2296218291","https://openalex.org/W2296463846","https://openalex.org/W2475334473","https://openalex.org/W2504869045","https://openalex.org/W2515471963","https://openalex.org/W2752862883","https://openalex.org/W2789599444","https://openalex.org/W2806891462","https://openalex.org/W2925610405","https://openalex.org/W2942460556","https://openalex.org/W2949967139","https://openalex.org/W2950537538","https://openalex.org/W2963341956","https://openalex.org/W2979365412","https://openalex.org/W2982083293","https://openalex.org/W3103056109","https://openalex.org/W3104745751","https://openalex.org/W3118873362","https://openalex.org/W3177452048","https://openalex.org/W4231091366","https://openalex.org/W4246500121","https://openalex.org/W4250027548","https://openalex.org/W4250405455","https://openalex.org/W4250470790","https://openalex.org/W4254092058"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W2146343568","https://openalex.org/W98480971","https://openalex.org/W2150291671","https://openalex.org/W1973046741","https://openalex.org/W2983282793","https://openalex.org/W2778498407","https://openalex.org/W4252501555","https://openalex.org/W2003848320"],"abstract_inverted_index":{"Due":[0],"to":[1,64,88],"the":[2,26,49,66,80,90,121,129,138],"missing":[3],"of":[4,8,52,69,123,131,140,152],"a":[5,43,110,149,182],"good":[6],"orchestration":[7],"loop":[9,62,67],"transformations,":[10],"existing":[11],"optimizing":[12],"compilers":[13],"for":[14,48,79,143,164,179],"deploying":[15],"neural":[16,55,124],"networks":[17,56],"on":[18,39,57],"GPU":[19,116],"either":[20],"parallelize":[21],"reductions":[22,53,96],"ineffectively":[23],"or":[24],"miss":[25],"fusion":[27],"opportunities":[28],"with":[29,186],"other":[30],"operators.":[31],"Neural":[32],"network":[33,125],"models":[34,126],"thus":[35],"exhibit":[36],"sub-optimal":[37],"performance":[38,122],"GPU.":[40,58],"We":[41],"present":[42],"practical":[44],"approach":[45,167,188],"called":[46],"Panamera":[47,59,84,108],"effective":[50],"parallelization":[51],"in":[54],"first":[60],"leverages":[61],"coalescing":[63],"flatten":[65],"dimensions":[68],"reductions,":[70],"converting":[71],"all":[72],"reduction":[73],"operators":[74,145],"into":[75],"canonical":[76],"forms":[77],"eligible":[78],"polyhedral":[81,86],"model.":[82],"Next,":[83],"uses":[85],"transformations":[87],"reduce":[89],"data":[91],"movements":[92],"caused":[93],"by":[94,104,173,191],"unfused":[95],"and":[97,156,162,171,176,178,194],"perform":[98],"multi-block":[99],"hardware":[100],"binding":[101],"not":[102],"considered":[103],"many":[105],"compilers.":[106],"Finally,":[107],"embeds":[109],"highly":[111],"optimized":[112],"routine":[113],"implemented":[114],"using":[115],"atomic":[117],"instructions,":[118],"further":[119],"improving":[120],"while":[127],"guaranteeing":[128],"correctness":[130],"parallel":[132],"reductions.":[133],"The":[134],"experimental":[135],"results":[136],"demonstrate":[137],"effectiveness":[139],"our":[141,146,166,187],"approach:":[142],"single":[144],"code":[147],"obtains":[148],"mean":[150],"speedup":[151],"33.7\u00d7,":[153],"3.5\u00d7,":[154],"5.4\u00d7":[155],"9.6\u00d7":[157],"over":[158],"cuDNN,":[159,169],"CUB,":[160],"TVM":[161,170],"Ansor,":[163],"sub-graphs":[165],"outperforms":[168,189],"Ansor":[172],"9.5\u00d7,":[174],"2.6\u00d7":[175],"2.7\u00d7,":[177],"end-to-end":[180],"workloads,":[181],"tensor":[183],"compiler":[184],"integrated":[185],"them":[190],"122.5%,":[192],"19.3%":[193],"15.2%.":[195]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-05-19T21:40:30.786675","created_date":"2025-10-10T00:00:00"}
