{"id":"https://openalex.org/W3204140704","doi":"https://doi.org/10.1145/3472456.3472473","title":"Optimizing Winograd-Based Convolution with Tensor Cores","display_name":"Optimizing Winograd-Based Convolution with Tensor Cores","publication_year":2021,"publication_date":"2021-08-09","ids":{"openalex":"https://openalex.org/W3204140704","doi":"https://doi.org/10.1145/3472456.3472473","mag":"3204140704"},"language":"en","primary_location":{"id":"doi:10.1145/3472456.3472473","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3472456.3472473","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"50th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101402440","display_name":"Junhong Liu","orcid":"https://orcid.org/0000-0001-7596-7011"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Junhong Liu","raw_affiliation_strings":["NVIDIA, China"],"affiliations":[{"raw_affiliation_string":"NVIDIA, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103128456","display_name":"Dongxu Yang","orcid":"https://orcid.org/0000-0002-7063-1663"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dongxu Yang","raw_affiliation_strings":["NVIDIA, China"],"affiliations":[{"raw_affiliation_string":"NVIDIA, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103287856","display_name":"Junjie Lai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junjie Lai","raw_affiliation_strings":["NVIDIA, China"],"affiliations":[{"raw_affiliation_string":"NVIDIA, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101402440"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.6777,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.717665,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9937999844551086,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9861999750137329,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.8265514373779297},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.697077751159668},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6479524374008179},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5828620791435242},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.49344292283058167},{"id":"https://openalex.org/keywords/convolutional-code","display_name":"Convolutional code","score":0.49023228883743286},{"id":"https://openalex.org/keywords/overlap\u2013add-method","display_name":"Overlap\u2013add method","score":0.46050214767456055},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.442655086517334},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.35335278511047363},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.27781039476394653},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.2640511691570282},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.21885421872138977},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.21257373690605164},{"id":"https://openalex.org/keywords/fourier-transform","display_name":"Fourier transform","score":0.12312397360801697},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.09245097637176514},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.08094948530197144},{"id":"https://openalex.org/keywords/mathematical-analysis","display_name":"Mathematical analysis","score":0.07733100652694702}],"concepts":[{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.8265514373779297},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.697077751159668},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6479524374008179},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5828620791435242},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.49344292283058167},{"id":"https://openalex.org/C157899210","wikidata":"https://www.wikidata.org/wiki/Q1395022","display_name":"Convolutional code","level":3,"score":0.49023228883743286},{"id":"https://openalex.org/C181002996","wikidata":"https://www.wikidata.org/wiki/Q1611641","display_name":"Overlap\u2013add method","level":5,"score":0.46050214767456055},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.442655086517334},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.35335278511047363},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.27781039476394653},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2640511691570282},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.21885421872138977},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.21257373690605164},{"id":"https://openalex.org/C102519508","wikidata":"https://www.wikidata.org/wiki/Q6520159","display_name":"Fourier transform","level":2,"score":0.12312397360801697},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.09245097637176514},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.08094948530197144},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.07733100652694702},{"id":"https://openalex.org/C76563020","wikidata":"https://www.wikidata.org/wiki/Q4817582","display_name":"Fractional Fourier transform","level":4,"score":0.0},{"id":"https://openalex.org/C203024314","wikidata":"https://www.wikidata.org/wiki/Q1365258","display_name":"Fourier analysis","level":3,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3472456.3472473","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3472456.3472473","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"50th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.47999998927116394,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1686810756","https://openalex.org/W1789336918","https://openalex.org/W1825672851","https://openalex.org/W1884620995","https://openalex.org/W1922123711","https://openalex.org/W2001121794","https://openalex.org/W2090593986","https://openalex.org/W2108598243","https://openalex.org/W2117130368","https://openalex.org/W2117539524","https://openalex.org/W2580538010","https://openalex.org/W2582996697","https://openalex.org/W2618530766","https://openalex.org/W2749587125","https://openalex.org/W2754249189","https://openalex.org/W2786374423","https://openalex.org/W2913668833","https://openalex.org/W2945580137","https://openalex.org/W2949245006","https://openalex.org/W2949650786","https://openalex.org/W2951894856","https://openalex.org/W2952713184","https://openalex.org/W2998957070","https://openalex.org/W3043303806","https://openalex.org/W3091804697","https://openalex.org/W3101543398","https://openalex.org/W3102337469","https://openalex.org/W4242726609","https://openalex.org/W4256309825"],"related_works":["https://openalex.org/W3157543420","https://openalex.org/W4372260258","https://openalex.org/W2267589039","https://openalex.org/W2369791303","https://openalex.org/W2759540840","https://openalex.org/W2133280289","https://openalex.org/W2360069155","https://openalex.org/W2169963286","https://openalex.org/W4254230825","https://openalex.org/W2919798019"],"abstract_inverted_index":{"Convolution":[0],"computing":[1,43,258],"is":[2,51,88,138,195,250,263],"one":[3],"of":[4,10,16,48,81,127,164,235,255],"the":[5,17,35,41,89,125,142,162,165,212,233,236,253,260],"primary":[6],"time":[7],"consuming":[8],"part":[9],"convolutional":[11,19,143],"neural":[12,20,144],"networks":[13,21],"(CNNs).":[14],"State":[15],"art":[18,166],"use":[22],"samll,":[23],"3":[24,26,60,67,85,87,108,133,185,245],"\u00d7":[25,58,61,65,68,72,76,86,106,109,131,134,183,186,243,246],"filters.":[27],"Recent":[28],"work":[29],"on":[30,114,156,200],"Winograd":[31,49,111,136,167,188,220,248],"convolution":[32,42,50,112,137,169,173,189,221,249],"can":[33],"reduce":[34],"computational":[36],"complexity":[37],"a":[38,196],"lot,":[39],"making":[40],"fast.":[44],"But":[45],"existing":[46],"implementations":[47],"limited":[52],"to":[53,140,151],"small":[54],"tiles,":[55],"i.e.":[56],"F(4":[57],"4,":[59],"3)":[62,69,110,135,187,247],"and":[63,74,84,92,153,170,223],"F(2":[64],"2,":[66],"where":[70],"4":[71,73],"2":[75,77],"are":[78],"tile":[79],"sizes":[80],"output":[82],"channels":[83],"filter":[90],"size,":[91],"single":[93],"precision":[94,104,129],"data.":[95],"In":[96],"this":[97],"paper,":[98],"we":[99,179,210],"propose":[100],"an":[101],"optimized":[102],"mixed":[103,128],"F(6":[105,130,182,242],"6,":[107,132,184,244],"implementation":[113,190],"NVIDIA":[115,157,192],"Ampere":[116,158],"GPUs":[117,201],"using":[118,217,240,256],"Tensor":[119],"Cores.":[120],"Our":[121],"experiments":[122,230],"show":[123,231],"that":[124,232],"accuracy":[126,234,254],"sufficient":[139],"infer":[141],"networks.":[145],"Besides,":[146],"our":[147,181,218,241],"method":[148],"achieves":[149],"up":[150],"15.71x":[152],"2.41x":[154],"speedup":[155],"A100,":[159],"compared":[160],"with":[161],"state":[163],"based":[168,172],"GEMM":[171],"in":[174],"cuDNN":[175],"8.1.0,":[176],"respectively.":[177],"Moreover,":[178],"integrate":[180],"into":[191],"TensorRT,":[193],"which":[194],"C++":[197],"inference":[198],"library":[199],"provided":[202],"by":[203,227],"NVIDIA,":[204],"as":[205],"custom":[206,219],"layer":[207],"plugins.":[208],"And":[209],"build":[211],"whole":[213,237],"VGG":[214,238,261],"network":[215,239,262],"model":[216],"layers":[222,225],"other":[224],"supported":[226],"TensorRT.":[228],"The":[229],"71.24%,":[251],"while":[252],"FP32":[257],"for":[259],"71.22%.":[264]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3},{"year":2022,"cited_by_count":4}],"updated_date":"2026-03-17T09:09:15.849793","created_date":"2025-10-10T00:00:00"}
