{"id":"https://openalex.org/W3091804697","doi":"https://doi.org/10.1109/dac18072.2020.9218566","title":"GPNPU: Enabling Efficient Hardware-Based Direct Convolution with Multi-Precision Support in GPU Tensor Cores","display_name":"GPNPU: Enabling Efficient Hardware-Based Direct Convolution with Multi-Precision Support in GPU Tensor Cores","publication_year":2020,"publication_date":"2020-07-01","ids":{"openalex":"https://openalex.org/W3091804697","doi":"https://doi.org/10.1109/dac18072.2020.9218566","mag":"3091804697"},"language":"en","primary_location":{"id":"doi:10.1109/dac18072.2020.9218566","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac18072.2020.9218566","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 57th ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067161373","display_name":"Zhuoran Song","orcid":"https://orcid.org/0000-0002-6494-4786"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhuoran Song","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100652312","display_name":"Jianfei Wang","orcid":"https://orcid.org/0000-0003-0942-518X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianfei Wang","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023632642","display_name":"Tianjian Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianjian Li","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053801300","display_name":"Li Jiang","orcid":"https://orcid.org/0000-0002-7353-8798"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Jiang","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100622704","display_name":"Jing Ke","orcid":"https://orcid.org/0000-0001-7459-257X"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Ke","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056632010","display_name":"Xiaoyao Liang","orcid":"https://orcid.org/0000-0002-2790-5884"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyao Liang","raw_affiliation_strings":["Biren Research","Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Biren Research","institution_ids":[]},{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045693138","display_name":"Naifeng Jing","orcid":"https://orcid.org/0000-0001-8417-5796"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Naifeng Jing","raw_affiliation_strings":["Shanghai Jiao Tong University"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5067161373"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.7816,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.7454964,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":93,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7977719902992249},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.642788827419281},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6278937458992004},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.5723773241043091},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.553459107875824},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5185302495956421},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.5104185938835144},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.48213669657707214},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.4752250015735626},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.4713073670864105},{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.4696226418018341},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.46047890186309814},{"id":"https://openalex.org/keywords/graphics-processing-unit","display_name":"Graphics processing unit","score":0.4261443614959717},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.27229544520378113},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08768898248672485}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7977719902992249},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.642788827419281},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6278937458992004},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.5723773241043091},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.553459107875824},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5185302495956421},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.5104185938835144},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.48213669657707214},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.4752250015735626},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.4713073670864105},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.4696226418018341},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.46047890186309814},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.4261443614959717},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27229544520378113},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08768898248672485},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac18072.2020.9218566","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac18072.2020.9218566","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 57th ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8399999737739563,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1667652561","https://openalex.org/W1841592590","https://openalex.org/W1999085092","https://openalex.org/W2009832130","https://openalex.org/W2044535169","https://openalex.org/W2067523571","https://openalex.org/W2094756095","https://openalex.org/W2117696986","https://openalex.org/W2521727659","https://openalex.org/W2604319603","https://openalex.org/W2606722458","https://openalex.org/W2624789836","https://openalex.org/W2754249189","https://openalex.org/W2899176839","https://openalex.org/W2900766348","https://openalex.org/W2945580137","https://openalex.org/W2953212265","https://openalex.org/W2963367920","https://openalex.org/W2963374099","https://openalex.org/W2963989532","https://openalex.org/W3004171485","https://openalex.org/W4212788319","https://openalex.org/W4247470470","https://openalex.org/W4302296459","https://openalex.org/W6637151318","https://openalex.org/W6638783484","https://openalex.org/W6677829916","https://openalex.org/W6739407111","https://openalex.org/W6744651773","https://openalex.org/W6756533898"],"related_works":["https://openalex.org/W2293118914","https://openalex.org/W2998381397","https://openalex.org/W4236419692","https://openalex.org/W3167919718","https://openalex.org/W4251718783","https://openalex.org/W2171015181","https://openalex.org/W1970548269","https://openalex.org/W2380696053","https://openalex.org/W2119413962","https://openalex.org/W2060611139"],"abstract_inverted_index":{"To":[0],"tailor":[1],"for":[2,114,132,137,164,173],"DNN":[3,166,179],"(Deep":[4],"Neural":[5],"Network)":[6],"acceleration,":[7,167],"GPU":[8,171],"has":[9],"migrated":[10],"to":[11,78,86,107,148,159,170],"new":[12],"architectures":[13],"such":[14],"as":[15,98],"NVIDIA":[16],"Volta":[17],"and":[18,76,90,112,135,157],"Turing":[19],"that":[20,121,153],"incorporate":[21],"dedicated":[22],"Tensor":[23,32,70,125,162],"Cores.":[24],"Although":[25],"good":[26],"at":[27],"GEMM":[28],"(generic":[29],"matrix-matrix":[30],"multiplication),":[31],"Cores":[33,71,126,163],"still":[34],"have":[35],"inefficiency":[36],"facing":[37],"convolutions":[38],"with":[39,72,82,139],"certain":[40],"layer":[41],"structures.":[42],"This":[43],"paper":[44],"proposes":[45],"a":[46,103],"GPNPU":[47,122],"(General-Purpose":[48],"Neural-network":[49],"Processing":[50],"Unit)":[51],"architecture,":[52],"which":[53],"offers":[54],"another":[55],"option":[56],"of":[57],"direct":[58,65],"convolution":[59,66,84],"in":[60,177],"GPU.":[61,99],"It":[62],"stitches":[63],"the":[64,69,95,161,174],"dataflow":[67],"into":[68],"little":[73],"hardware":[74],"support,":[75],"resorts":[77],"regulated":[79],"data":[80],"layout":[81],"stripe-mined":[83],"execution":[85],"achieve":[87],"higher":[88,115],"performance":[89,145],"power":[91],"efficiency,":[92],"while":[93,168],"retaining":[94],"general":[96],"programability":[97],"We":[100],"further":[101],"apply":[102],"unified":[104],"core":[105],"design":[106],"support":[108],"varied":[109],"operand":[110],"types":[111],"precision":[113],"computing":[116],"throughput.":[117],"The":[118,143],"evaluation":[119],"shows":[120],"can":[123],"outperform":[124],"on":[127],"typical":[128],"DNNs":[129],"by":[130],"1.4X":[131],"inference":[133],"(FP16)":[134],"1.2X":[136],"training":[138],"much":[140],"reduced":[141],"power.":[142],"INT8":[144],"even":[146],"increases":[147],"2.4X.":[149],"Our":[150],"study":[151],"demonstrates":[152],"it":[154],"is":[155],"possible":[156],"appealing":[158],"refine":[160],"greater":[165],"conforming":[169],"architecture":[172],"programmability":[175],"necessary":[176],"future":[178],"evolution.":[180]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
