{"id":"https://openalex.org/W7116288485","doi":"https://doi.org/10.1145/3754598.3754599","title":"WinRS: Accelerate Winograd Backward-Filter Convolution with Tiny Workspace","display_name":"WinRS: Accelerate Winograd Backward-Filter Convolution with Tiny Workspace","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116288485","doi":"https://doi.org/10.1145/3754598.3754599"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754599","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754599","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754599","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhiyi Zhang","orcid":"https://orcid.org/0009-0006-7849-3222"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiyi Zhang","raw_affiliation_strings":["University of Science and Technology of China, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0009-0006-7849-3222","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, Anhi, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120916874","display_name":"Junshi Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junshi Chen","raw_affiliation_strings":["University of Science and Technology of China, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0000-0002-6487-3658","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, Anhi, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jingwei Sun","orcid":"https://orcid.org/0000-0001-5098-1503"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingwei Sun","raw_affiliation_strings":["University of Science and Technology of China, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0000-0001-5098-1503","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, Anhi, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120941884","display_name":"Pengfei Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I2802624667","display_name":"Hefei Institutes of Physical Science","ror":"https://ror.org/046n57345","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I2802624667"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengfei Zhang","raw_affiliation_strings":["Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0000-0001-5415-9592","affiliations":[{"raw_affiliation_string":"Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China","institution_ids":["https://openalex.org/I2802624667"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049138286","display_name":"Zhuopin Xu","orcid":"https://orcid.org/0000-0002-1629-5988"},"institutions":[{"id":"https://openalex.org/I2802624667","display_name":"Hefei Institutes of Physical Science","ror":"https://ror.org/046n57345","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I2802624667"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhuopin Xu","raw_affiliation_strings":["Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0000-0002-1629-5988","affiliations":[{"raw_affiliation_string":"Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China","institution_ids":["https://openalex.org/I2802624667"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120878069","display_name":"Jun Shi","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Shi","raw_affiliation_strings":["University of Science and Technology of China, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0000-0002-9888-6238","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, Anhi, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5120895369","display_name":"Qi Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I2802624667","display_name":"Hefei Institutes of Physical Science","ror":"https://ror.org/046n57345","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I2802624667"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qi Wang","raw_affiliation_strings":["Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China"],"raw_orcid":"https://orcid.org/0000-0002-5810-9223","affiliations":[{"raw_affiliation_string":"Hefei Institutes of Physical Science, Chinese Academy of Sciences, Hefei, Anhi, China","institution_ids":["https://openalex.org/I2802624667"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.60328698,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"115","last_page":"124"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.47049999237060547,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.47049999237060547,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.1307000070810318,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.030500000342726707,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.7200000286102295},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.66839998960495},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6481000185012817},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.6385999917984009},{"id":"https://openalex.org/keywords/fast-fourier-transform","display_name":"Fast Fourier transform","score":0.5946999788284302},{"id":"https://openalex.org/keywords/workspace","display_name":"Workspace","score":0.5214999914169312},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.49219998717308044},{"id":"https://openalex.org/keywords/porting","display_name":"Porting","score":0.44609999656677246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7986000180244446},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.7200000286102295},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.66839998960495},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6481000185012817},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.6385999917984009},{"id":"https://openalex.org/C75172450","wikidata":"https://www.wikidata.org/wiki/Q623950","display_name":"Fast Fourier transform","level":2,"score":0.5946999788284302},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5401999950408936},{"id":"https://openalex.org/C58581272","wikidata":"https://www.wikidata.org/wiki/Q12741163","display_name":"Workspace","level":3,"score":0.5214999914169312},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.49219998717308044},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4510999917984009},{"id":"https://openalex.org/C106251023","wikidata":"https://www.wikidata.org/wiki/Q851989","display_name":"Porting","level":3,"score":0.44609999656677246},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.4359000027179718},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.41119998693466187},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.3718999922275543},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.33329999446868896},{"id":"https://openalex.org/C149810388","wikidata":"https://www.wikidata.org/wiki/Q5374873","display_name":"Emulation","level":2,"score":0.3310999870300293},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.3253999948501587},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.30300000309944153},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.296099990606308},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.25450000166893005},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.2515999972820282},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754599","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754599","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754599","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754599","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.42244836688041687,"display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1487564550","https://openalex.org/W2098220211","https://openalex.org/W2108598243","https://openalex.org/W2172654076","https://openalex.org/W2194775991","https://openalex.org/W2752782242","https://openalex.org/W2786374423","https://openalex.org/W2913221350","https://openalex.org/W2964525696","https://openalex.org/W2977634443","https://openalex.org/W2998957070","https://openalex.org/W3000160544","https://openalex.org/W3008378296","https://openalex.org/W3013525232","https://openalex.org/W3015869703","https://openalex.org/W3194710734","https://openalex.org/W3202294838","https://openalex.org/W3204140704","https://openalex.org/W3217153855","https://openalex.org/W4224310326","https://openalex.org/W4312443924","https://openalex.org/W4321488033","https://openalex.org/W4388893857","https://openalex.org/W4389459222","https://openalex.org/W4401408852","https://openalex.org/W4402775306"],"related_works":[],"abstract_inverted_index":{"Winograd":[0],"algorithm":[1],"powerfully":[2],"accelerates":[3],"Convolutional":[4],"Neural":[5],"Networks.":[6],"However,":[7],"for":[8],"backward-filter":[9],"convolution":[10],"(BFC),":[11],"existing":[12],"implementations":[13],"often":[14],"struggle":[15],"to":[16,26,56,99,120],"achieve":[17],"both":[18],"high":[19],"throughput":[20,108,144],"and":[21,31,40,52,139,141,147],"low":[22],"memory":[23,68],"usage,":[24],"due":[25],"challenges":[27],"from":[28],"large":[29,47],"filters":[30,48],"small":[32],"outputs.":[33],"We":[34],"propose":[35],"WinRS,":[36],"a":[37],"fast,":[38],"memory-efficient,":[39],"flexible":[41],"BFC":[42,65],"algorithm.":[43],"WinRS":[44,80,104,116,130],"reduces":[45],"N-D":[46],"into":[49,84],"1D":[50,78],"formats":[51],"precisely":[53],"splits":[54],"them":[55],"match":[57],"the":[58,73],"fastest":[59],"kernels.":[60],"These":[61],"fully-fused":[62],"kernels":[63],"execute":[64],"in":[66,94],"on-chip":[67],"with":[69,145],"tiny":[70],"workspace,":[71],"leveraging":[72],"superior":[74],"acceleration":[75],"potential":[76],"of":[77,88,109,136],"Winograd.":[79],"adaptively":[81],"balances":[82],"workloads":[83],"an":[85],"optimal":[86],"number":[87],"block":[89],"groups,":[90],"maximizing":[91],"hardware":[92],"utilization":[93],"small-output":[95],"cases.":[96],"When":[97],"ported":[98],"FP16":[100],"on":[101],"Tensor":[102],"Cores,":[103],"achieves":[105,117],"3.27":[106],"\u00d7":[107,119,122],"its":[110],"FP32":[111],"CUDA-Core":[112],"version.":[113],"In":[114],"experiments,":[115],"1.05":[118],"4.7":[121],"speedup":[123],"over":[124],"cuDNN":[125,137],"GEMM":[126],"using":[127],"comparable":[128],"workspace;":[129],"uses":[131],"less":[132],"than":[133],"4%":[134],"workspace":[135],"FFT":[138],"Winograd,":[140],"exhibits":[142],"higher":[143],"memory-":[146],"FLOP-bound":[148],"workloads.":[149]},"counts_by_year":[],"updated_date":"2025-12-21T02:06:08.432651","created_date":"2025-12-21T00:00:00"}
