{"id":"https://openalex.org/W4417403253","doi":"https://doi.org/10.1109/pact65351.2025.00045","title":"Optimize Winograd Convolution for a Novel MIMD Many-core Architecture PEZY-SC3s","display_name":"Optimize Winograd Convolution for a Novel MIMD Many-core Architecture PEZY-SC3s","publication_year":2025,"publication_date":"2025-11-03","ids":{"openalex":"https://openalex.org/W4417403253","doi":"https://doi.org/10.1109/pact65351.2025.00045"},"language":null,"primary_location":{"id":"doi:10.1109/pact65351.2025.00045","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00045","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061300320","display_name":"Yi Zhou","orcid":"https://orcid.org/0000-0003-4638-898X"},"institutions":[{"id":"https://openalex.org/I111149068","display_name":"National Defense University","ror":"https://ror.org/01nqk4x38","country_code":"US","type":"education","lineage":["https://openalex.org/I111149068"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yi Zhou","raw_affiliation_strings":["National University of Defense Technology,Laboratory of Digitizing Software for Frontier Equipment,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,Laboratory of Digitizing Software for Frontier Equipment,China","institution_ids":["https://openalex.org/I111149068"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101913669","display_name":"Qinglin Wang","orcid":"https://orcid.org/0000-0002-8286-6566"},"institutions":[{"id":"https://openalex.org/I111149068","display_name":"National Defense University","ror":"https://ror.org/01nqk4x38","country_code":"US","type":"education","lineage":["https://openalex.org/I111149068"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qinglin Wang","raw_affiliation_strings":["National University of Defense Technology,Laboratory of Digitizing Software for Frontier Equipment,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,Laboratory of Digitizing Software for Frontier Equipment,China","institution_ids":["https://openalex.org/I111149068"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119011517","display_name":"Lian Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210158984","display_name":"National Supercomputing Center in Wuxi","ror":"https://ror.org/04ypjrs34","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210158984"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lian Wang","raw_affiliation_strings":["Shanxi Supercomputing Center,China"],"affiliations":[{"raw_affiliation_string":"Shanxi Supercomputing Center,China","institution_ids":["https://openalex.org/I4210158984"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085465204","display_name":"Zhiyan Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210158984","display_name":"National Supercomputing Center in Wuxi","ror":"https://ror.org/04ypjrs34","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210158984"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyan Liu","raw_affiliation_strings":["Shanxi Supercomputing Center,China"],"affiliations":[{"raw_affiliation_string":"Shanxi Supercomputing Center,China","institution_ids":["https://openalex.org/I4210158984"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048298058","display_name":"Bingwei Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210158984","display_name":"National Supercomputing Center in Wuxi","ror":"https://ror.org/04ypjrs34","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210158984"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bingwei Wang","raw_affiliation_strings":["Shanxi Supercomputing Center,China"],"affiliations":[{"raw_affiliation_string":"Shanxi Supercomputing Center,China","institution_ids":["https://openalex.org/I4210158984"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100415356","display_name":"Feng Liu","orcid":"https://orcid.org/0009-0005-1040-2999"},"institutions":[{"id":"https://openalex.org/I4210158984","display_name":"National Supercomputing Center in Wuxi","ror":"https://ror.org/04ypjrs34","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210158984"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feiming Liu","raw_affiliation_strings":["Shanxi Supercomputing Center,China"],"affiliations":[{"raw_affiliation_string":"Shanxi Supercomputing Center,China","institution_ids":["https://openalex.org/I4210158984"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057280984","display_name":"Xiangdong Pei","orcid":null},"institutions":[{"id":"https://openalex.org/I4210158984","display_name":"National Supercomputing Center in Wuxi","ror":"https://ror.org/04ypjrs34","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210158984"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangdong Pei","raw_affiliation_strings":["Shanxi Supercomputing Center,China"],"affiliations":[{"raw_affiliation_string":"Shanxi Supercomputing Center,China","institution_ids":["https://openalex.org/I4210158984"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052226742","display_name":"Jie Liu","orcid":"https://orcid.org/0000-0002-9297-7729"},"institutions":[{"id":"https://openalex.org/I111149068","display_name":"National Defense University","ror":"https://ror.org/01nqk4x38","country_code":"US","type":"education","lineage":["https://openalex.org/I111149068"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jie Liu","raw_affiliation_strings":["National University of Defense Technology,Laboratory of Digitizing Software for Frontier Equipment,China"],"affiliations":[{"raw_affiliation_string":"National University of Defense Technology,Laboratory of Digitizing Software for Frontier Equipment,China","institution_ids":["https://openalex.org/I111149068"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5061300320"],"corresponding_institution_ids":["https://openalex.org/I111149068"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.40858817,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"431","last_page":"443"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7633000016212463,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7633000016212463,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.03709999844431877,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.033399999141693115,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mimd","display_name":"MIMD","score":0.8521000146865845},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.7038999795913696},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.4424999952316284},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.4223000109195709},{"id":"https://openalex.org/keywords/simd","display_name":"SIMD","score":0.3901999890804291},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.3849000036716461},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.3799000084400177},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.37950000166893005}],"concepts":[{"id":"https://openalex.org/C21032095","wikidata":"https://www.wikidata.org/wiki/Q1149237","display_name":"MIMD","level":2,"score":0.8521000146865845},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7872999906539917},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7239000201225281},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.7038999795913696},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.4424999952316284},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4323999881744385},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.4223000109195709},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.3901999890804291},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3849000036716461},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.3799000084400177},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.37950000166893005},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.37220001220703125},{"id":"https://openalex.org/C96972482","wikidata":"https://www.wikidata.org/wiki/Q1049168","display_name":"Xeon Phi","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.33550000190734863},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.31459999084472656},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3140999972820282},{"id":"https://openalex.org/C145108525","wikidata":"https://www.wikidata.org/wiki/Q656154","display_name":"Xeon","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C75172450","wikidata":"https://www.wikidata.org/wiki/Q623950","display_name":"Fast Fourier transform","level":2,"score":0.2922999858856201},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2721000015735626},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.25940001010894775}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/pact65351.2025.00045","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00045","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1487564550","https://openalex.org/W1993508117","https://openalex.org/W2002555321","https://openalex.org/W2172654076","https://openalex.org/W2194775991","https://openalex.org/W2252007067","https://openalex.org/W2516525699","https://openalex.org/W2605739168","https://openalex.org/W2618530766","https://openalex.org/W2749587125","https://openalex.org/W2786374423","https://openalex.org/W2924040443","https://openalex.org/W2958885886","https://openalex.org/W2963673357","https://openalex.org/W2965571038","https://openalex.org/W2988926770","https://openalex.org/W2998957070","https://openalex.org/W3008378296","https://openalex.org/W3057502114","https://openalex.org/W3129601694","https://openalex.org/W3143841271","https://openalex.org/W3170836278","https://openalex.org/W3182270196","https://openalex.org/W3194710734","https://openalex.org/W3202294838","https://openalex.org/W3203564148","https://openalex.org/W3204140704","https://openalex.org/W3204278478","https://openalex.org/W3210601829","https://openalex.org/W4200278572","https://openalex.org/W4243682116","https://openalex.org/W4244254628","https://openalex.org/W4391387876","https://openalex.org/W4392306437","https://openalex.org/W4401408852","https://openalex.org/W4405390006","https://openalex.org/W4405516673","https://openalex.org/W4407603895","https://openalex.org/W4407604104"],"related_works":[],"abstract_inverted_index":{"Optimizing":[0],"convolution":[1,16,31,36,60,85,91,121,136],"operations":[2,61,86],"is":[3],"critical":[4],"for":[5,19,42,59,135],"enhancing":[6],"the":[7,34,119],"performance":[8,83,108],"of":[9,106,199],"convolutional":[10,141],"neural":[11],"networks":[12],"(CNNs).":[13],"The":[14,168],"Winograd":[15,35,120,170],"algorithm,":[17],"renowned":[18],"its":[20,133],"significant":[21],"reduction":[22],"in":[23,30,84,154,165],"computational":[24,163,175],"complexity,":[25],"has":[26,38],"been":[27,39],"widely":[28,145],"adopted":[29,146],"acceleration.":[32],"While":[33],"algorithm":[37,122,171],"highly":[40],"optimized":[41,169],"traditional":[43],"computing":[44],"platforms":[45],"such":[46],"as":[47],"SIMD-based":[48],"CPUs":[49],"and":[50,71,82,109,117,160,185,202],"SIMT-based":[51],"GPUs,":[52],"research":[53],"on":[54,123,181,188],"alternative":[55],"architectures":[56],"better":[57],"suited":[58],"remains":[62],"ongoing.":[63],"MIMD":[64,126],"architectures,":[65],"characterized":[66],"by":[67],"their":[68,80],"high":[69],"parallelism":[70],"thread":[72],"divergence":[73],"mitigation":[74],"capabilities,":[75],"present":[76],"promising":[77],"potential;":[78],"nevertheless,":[79],"applicability":[81],"remain":[87],"underexplored.":[88],"Additionally,":[89],"GPU-based":[90],"acceleration,":[92],"despite":[93],"delivering":[94],"exceptional":[95],"performance,":[96],"faces":[97],"escalating":[98],"energy":[99,110,195],"consumption":[100],"challenges,":[101],"necessitating":[102],"a":[103,124],"balanced":[104],"optimization":[105],"hardware":[107],"efficiency.":[111],"To":[112],"address":[113],"these,":[114],"we":[115],"implement":[116],"optimize":[118],"low-power":[125],"many-core":[127],"processor":[128],"PEZY-SC3s,":[129],"aiming":[130],"to":[131,178],"investigate":[132],"viability":[134],"workloads.":[137],"Experimental":[138],"evaluations":[139],"using":[140],"layer":[142],"parameters":[143],"from":[144],"CNNs":[147],"demonstrate:":[148],"1)":[149],"78.37%":[150],"average":[151],"bandwidth":[152],"utilization":[153],"data-intensive":[155],"stages;":[156],"2)":[157],"98%":[158],"single-core":[159],"92.52%":[161],"system-wide":[162],"efficiency":[164,176,196],"compute-intensive":[166],"phases.":[167],"achieves":[172],"significantly":[173],"superior":[174],"compared":[177],"cuDNN-based":[179],"implementation":[180,187],"Nvidia":[182],"A100":[183],"GPU":[184],"oneDNN-based":[186],"Intel":[189],"Xeon":[190],"Silver":[191],"4314":[192],"CPU,":[193],"with":[194],"ratios":[197],"(GFLOPS/W)":[198],"$2.58":[200],"\\times$":[201],"$27.52":[203],"\\times$,":[204],"respectively.":[205]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-12-16T00:00:00"}
