{"id":"https://openalex.org/W4415707002","doi":"https://doi.org/10.1109/tcsi.2025.3621728","title":"WiFlow: A Precision-Scalable DNN Training Accelerator Through Winograd Algorithm and Dataflow Co-Design","display_name":"WiFlow: A Precision-Scalable DNN Training Accelerator Through Winograd Algorithm and Dataflow Co-Design","publication_year":2025,"publication_date":"2025-10-30","ids":{"openalex":"https://openalex.org/W4415707002","doi":"https://doi.org/10.1109/tcsi.2025.3621728"},"language":null,"primary_location":{"id":"doi:10.1109/tcsi.2025.3621728","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsi.2025.3621728","pdf_url":null,"source":{"id":"https://openalex.org/S116977442","display_name":"IEEE Transactions on Circuits and Systems I Regular Papers","issn_l":"1549-8328","issn":["1549-8328","1558-0806"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems I: Regular Papers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015500761","display_name":"Hui Wang","orcid":"https://orcid.org/0009-0002-8690-3608"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hui Wang","raw_affiliation_strings":["School of Electronic Science and Engineering, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0002-8690-3608","affiliations":[{"raw_affiliation_string":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinming Lu","orcid":"https://orcid.org/0009-0007-0192-3703"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinming Lu","raw_affiliation_strings":["School of Electronic Science and Engineering, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0007-0192-3703","affiliations":[{"raw_affiliation_string":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120192569","display_name":"Rui Ding","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Ding","raw_affiliation_strings":["School of Electronic Science and Engineering, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0005-6454-0318","affiliations":[{"raw_affiliation_string":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107945960","display_name":"W. Ma","orcid":"https://orcid.org/0009-0008-2365-5284"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weize Ma","raw_affiliation_strings":["School of Electronic Science and Engineering, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0008-2365-5284","affiliations":[{"raw_affiliation_string":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100696999","display_name":"Zhongfeng Wang","orcid":"https://orcid.org/0000-0002-7227-4786"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongfeng Wang","raw_affiliation_strings":["School of Electronic Science and Engineering, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-7227-4786","affiliations":[{"raw_affiliation_string":"School of Electronic Science and Engineering, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100583800","display_name":"Jun Lin","orcid":"https://orcid.org/0009-0005-3505-4847"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Lin","raw_affiliation_strings":["School of Integrated Circuits, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0005-3505-4847","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.26939747,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"73","issue":"4","first_page":"2659","last_page":"2672"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.5123999714851379,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.5123999714851379,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10714","display_name":"Software-Defined Networks and 5G","score":0.08649999648332596,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.045099999755620956,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.8263999819755554},{"id":"https://openalex.org/keywords/operand","display_name":"Operand","score":0.6826000213623047},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6191999912261963},{"id":"https://openalex.org/keywords/dram","display_name":"Dram","score":0.5044000148773193},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.49549999833106995},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.4869000017642975},{"id":"https://openalex.org/keywords/floating-point","display_name":"Floating point","score":0.4560999870300293},{"id":"https://openalex.org/keywords/edge-device","display_name":"Edge device","score":0.42260000109672546},{"id":"https://openalex.org/keywords/application-specific-integrated-circuit","display_name":"Application-specific integrated circuit","score":0.40860000252723694},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.39169999957084656}],"concepts":[{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.8263999819755554},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8109999895095825},{"id":"https://openalex.org/C55526617","wikidata":"https://www.wikidata.org/wiki/Q719375","display_name":"Operand","level":2,"score":0.6826000213623047},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6191999912261963},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.5044000148773193},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.49549999833106995},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.4869000017642975},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.4560999870300293},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4507000148296356},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.424699991941452},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.42260000109672546},{"id":"https://openalex.org/C77390884","wikidata":"https://www.wikidata.org/wiki/Q217302","display_name":"Application-specific integrated circuit","level":2,"score":0.40860000252723694},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C46362747","wikidata":"https://www.wikidata.org/wiki/Q173431","display_name":"CMOS","level":2,"score":0.3720000088214874},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.3580000102519989},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.35370001196861267},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.34869998693466187},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.34290000796318054},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.3422999978065491},{"id":"https://openalex.org/C68043766","wikidata":"https://www.wikidata.org/wiki/Q267416","display_name":"Static random-access memory","level":2,"score":0.33090001344680786},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C118993495","wikidata":"https://www.wikidata.org/wiki/Q5042828","display_name":"Electrical efficiency","level":3,"score":0.3138999938964844},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.3102000057697296},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C176727019","wikidata":"https://www.wikidata.org/wiki/Q1172415","display_name":"Dataflow architecture","level":3,"score":0.2743000090122223},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C172385210","wikidata":"https://www.wikidata.org/wiki/Q5339","display_name":"Transistor","level":3,"score":0.2727999985218048},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2721000015735626},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.27059999108314514}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsi.2025.3621728","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsi.2025.3621728","pdf_url":null,"source":{"id":"https://openalex.org/S116977442","display_name":"IEEE Transactions on Circuits and Systems I Regular Papers","issn_l":"1549-8328","issn":["1549-8328","1558-0806"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems I: Regular Papers","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2250357272","display_name":null,"funder_award_id":"62174084","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2348869240","display_name":null,"funder_award_id":"BG2024032","funder_id":"https://openalex.org/F4320313574","funder_display_name":"Jiangsu Province Postdoctoral Science Foundation"},{"id":"https://openalex.org/G5354445887","display_name":null,"funder_award_id":"62341408","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320313574","display_name":"Jiangsu Province Postdoctoral Science Foundation","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"To":[0,53],"address":[1,54],"performance":[2],"degradation":[3],"from":[4],"the":[5,70,76,98,148,153,183,268],"domain":[6],"shift":[7],"and":[8,17,39,80,94,115,117,122,132,180,187,219,232,251],"to":[9,151,176],"support":[10],"user-specific":[11],"services":[12],"while":[13],"considering":[14],"privacy,":[15],"security,":[16],"communication":[18],"overhead,":[19],"there":[20],"is":[21,50,195,202],"an":[22,86,211,220,233],"urgent":[23],"need":[24],"for":[25,30,65],"efficient":[26],"on-device":[27,89,270],"training":[28,46,90,192,265,271],"accelerators":[29],"deep":[31],"neural":[32],"networks":[33],"(DNNs).":[34],"Given":[35],"limited":[36],"computing":[37],"resources":[38],"battery":[40],"capacity":[41],"constraints,":[42],"implementing":[43],"complex":[44],"DNN":[45,264],"on":[47,197],"edge":[48],"devices":[49],"extremely":[51],"challenging.":[52],"these":[55],"issues,":[56],"we":[57,83,160],"introduce":[58],"a":[59,162,206,227],"Winograd-Integrated":[60],"Gradient":[61],"Optimization":[62,174],"Framework":[63],"(WIGOF)":[64],"cross-phase":[66],"operand":[67],"sharing":[68],"in":[69,127,262],"Winograd":[71,100,163],"domain,":[72],"which":[73],"significantly":[74],"reduces":[75],"number":[77],"of":[78,97,155,185,193,214,223,230,236],"multiplications":[79],"additions.":[81],"Additionally,":[82],"develop":[84],"WiFlow,":[85],"efficient,":[87],"precision-scalable":[88],"accelerator,":[91],"minimizing":[92],"area":[93,212,249],"power":[95],"overheads":[96],"dedicated":[99],"transformation":[101],"unit.":[102],"The":[103,190],"WiFlow":[104,194,201,239],"supports":[105],"16-bit":[106,110],"floating":[107,112],"point":[108,113,120],"(FP16),":[109],"brain":[111],"(BF16),":[114],"8":[116],"4-bit":[118],"fixed":[119],"(INT8":[121],"INT4),":[123],"demonstrating":[124],"scalable":[125],"improvements":[126],"both":[128,182],"computational":[129],"throughput":[130],"(TOPS)":[131],"energy":[133,221,260],"efficiency":[134,213,222,250,261],"(TOPS/W)":[135],"at":[136,226],"low":[137],"precision.":[138],"A":[139],"novel":[140],"data":[141,178],"rearrangement":[142],"pattern,":[143],"named":[144],"channel":[145],"augmentation,":[146],"addresses":[147],"imperfect":[149],"decomposition":[150],"enhance":[152],"utilization":[154],"processing":[156],"element":[157],"units.":[158],"Furthermore,":[159],"propose":[161],"interleaved":[164],"block-execution":[165],"dataflow":[166],"(WInBlock),":[167],"along":[168],"with":[169,205,267],"Hierarchical":[170],"Adaptive":[171],"Reuse":[172],"Memory":[173],"(HARM)":[175],"improve":[177],"reuse":[179],"reduce":[181],"amount":[184],"DRAM":[186],"SRAM":[188],"access.":[189],"end-to-end":[191],"achieved":[196],"Xilinx":[198],"XCVU440":[199],"FPGA.":[200],"also":[203],"synthesized":[204],"28nm":[207],"CMOS":[208],"technology,":[209],"achieving":[210],"624":[215],"GOPS/mm<sup":[216],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[217,242,253],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>":[218],"4.4":[224],"TOPS/W":[225],"supply":[228],"voltage":[229],"0.9V":[231],"operating":[234],"frequency":[235],"500":[237],"MHz.":[238],"accomplishes":[240],"<inline-formula":[241,252],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">":[243,254],"<tex-math":[244,255],"notation=\"LaTeX\">$7.75\\times":[245],"$</tex-math>":[246,257],"</inline-formula>":[247,258],"higher":[248,259],"notation=\"LaTeX\">$2.91\\times":[256],"actual":[263],"compared":[266],"state-of-the-art":[269],"accelerators.":[272]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-30T00:00:00"}
