{"id":"https://openalex.org/W4389162736","doi":"https://doi.org/10.1109/iccad57390.2023.10323752","title":"FET-OPU: A Flexible and Efficient FPGA-Based Overlay Processor for Transformer Networks","display_name":"FET-OPU: A Flexible and Efficient FPGA-Based Overlay Processor for Transformer Networks","publication_year":2023,"publication_date":"2023-10-28","ids":{"openalex":"https://openalex.org/W4389162736","doi":"https://doi.org/10.1109/iccad57390.2023.10323752"},"language":"en","primary_location":{"id":"doi:10.1109/iccad57390.2023.10323752","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad57390.2023.10323752","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100580160","display_name":"Yueyin Bai","orcid":"https://orcid.org/0009-0006-2938-5153"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yueyin Bai","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102009223","display_name":"Hao Zhou","orcid":"https://orcid.org/0000-0003-0642-881X"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Zhou","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027433777","display_name":"Keqing Zhao","orcid":"https://orcid.org/0009-0001-4951-4059"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Keqing Zhao","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101406825","display_name":"Hongji Wang","orcid":"https://orcid.org/0000-0002-8834-1592"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongji Wang","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101854580","display_name":"Jianli Chen","orcid":"https://orcid.org/0000-0002-1391-2696"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianli Chen","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103098516","display_name":"Jun Yu","orcid":"https://orcid.org/0000-0003-4286-9292"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Yu","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100366737","display_name":"Kun Wang","orcid":"https://orcid.org/0000-0003-4180-6150"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kun Wang","raw_affiliation_strings":["Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of ASIC &#x0026; System,Shanghai,China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100580160"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":3.4106,"has_fulltext":false,"cited_by_count":17,"citation_normalized_percentile":{"value":0.93144799,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9769999980926514,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10223","display_name":"Microgrid Control and Optimization","score":0.9688000082969666,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8457913398742676},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.671355664730072},{"id":"https://openalex.org/keywords/energy-consumption","display_name":"Energy consumption","score":0.5321615934371948},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5073191523551941},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.48584529757499695},{"id":"https://openalex.org/keywords/application-specific-integrated-circuit","display_name":"Application-specific integrated circuit","score":0.4801199436187744},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.4643917679786682},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.46315211057662964},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.45302221179008484},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.44966474175453186},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4457446336746216},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.4364810585975647},{"id":"https://openalex.org/keywords/gate-array","display_name":"Gate array","score":0.4130803346633911},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.41241690516471863},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.11933934688568115}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8457913398742676},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.671355664730072},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.5321615934371948},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5073191523551941},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.48584529757499695},{"id":"https://openalex.org/C77390884","wikidata":"https://www.wikidata.org/wiki/Q217302","display_name":"Application-specific integrated circuit","level":2,"score":0.4801199436187744},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.4643917679786682},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.46315211057662964},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.45302221179008484},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.44966474175453186},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4457446336746216},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.4364810585975647},{"id":"https://openalex.org/C114237110","wikidata":"https://www.wikidata.org/wiki/Q114901","display_name":"Gate array","level":3,"score":0.4130803346633911},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.41241690516471863},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.11933934688568115},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccad57390.2023.10323752","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad57390.2023.10323752","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE/ACM International Conference on Computer Aided Design (ICCAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.9100000262260437,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W2613938013","https://openalex.org/W2798341898","https://openalex.org/W2798515322","https://openalex.org/W2804032941","https://openalex.org/W2896457183","https://openalex.org/W2965373594","https://openalex.org/W2979455536","https://openalex.org/W3008408165","https://openalex.org/W3017024317","https://openalex.org/W3034100311","https://openalex.org/W3043675702","https://openalex.org/W3047848469","https://openalex.org/W3094502228","https://openalex.org/W3130240120","https://openalex.org/W3155487259","https://openalex.org/W3184454880","https://openalex.org/W3189877953","https://openalex.org/W3206749822","https://openalex.org/W3209892756","https://openalex.org/W4287025617","https://openalex.org/W4288337707","https://openalex.org/W4293025109","https://openalex.org/W4313467238","https://openalex.org/W4319870564","https://openalex.org/W4321232185","https://openalex.org/W4383749557","https://openalex.org/W6751109714","https://openalex.org/W6751349269","https://openalex.org/W6755207826","https://openalex.org/W6762805369","https://openalex.org/W6799370299"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W2146343568","https://openalex.org/W98480971","https://openalex.org/W2150291671","https://openalex.org/W2013643406","https://openalex.org/W2027972911","https://openalex.org/W2157978810","https://openalex.org/W2778498407","https://openalex.org/W4391267927"],"abstract_inverted_index":{"There":[0],"are":[1],"already":[2],"some":[3],"works":[4],"on":[5,19,88,135],"accelerating":[6],"transformer":[7,69],"networks":[8],"with":[9,169,175,181,195,207],"field-programmable":[10],"gate":[11],"array":[12,86,95],"(FPGA).":[13],"However,":[14],"many":[15],"accelerators":[16,199],"focus":[17],"only":[18],"attention":[20],"computation":[21],"or":[22],"suffer":[23],"from":[24],"fixed":[25],"data":[26,97,130],"streams":[27],"without":[28,36],"flexibility.":[29],"Moreover,":[30],"their":[31],"hardware":[32,43],"performance":[33],"is":[34,107],"limited":[35],"schedule":[37],"optimization":[38],"and":[39,52,92,99,146,150,163,171,189,200],"full":[40],"use":[41],"of":[42,68,81],"resources.":[44],"In":[45,139],"this":[46],"article,":[47],"we":[48,60,141,184],"propose":[49,72],"a":[50,73,82,93,143],"flexible":[51],"efficient":[53,102],"FPGA-based":[54],"overlay":[55,63,126],"processor,":[56],"named":[57],"FET-OPU.":[58],"Specifically,":[59],"design":[61],"an":[62,121],"architecture":[64],"for":[65,96,124],"general":[66],"accelerations":[67],"networks.":[70],"We":[71,118],"unique":[74],"matrix":[75],"multiplication":[76],"unit":[77,105],"(MMU),":[78],"which":[79,110],"consists":[80],"processing":[83],"element":[84],"(PE)":[85],"based":[87],"modified":[89],"DSP-packing":[90],"technology":[91],"FIFO":[94],"caching":[98],"rearrangement.":[100],"An":[101],"non-linear":[103,116],"function":[104],"(NFU)":[106],"also":[108,119],"introduced,":[109],"can":[111,201],"calculate":[112],"arbitrary":[113],"single":[114],"input":[115],"functions.":[117],"customize":[120],"instruction":[122],"set":[123],"our":[125,158],"architecture,":[127],"dynamically":[128],"controlling":[129],"flows":[131],"by":[132],"instructions":[133],"generated":[134],"the":[136,148],"software":[137],"side.":[138],"addition,":[140],"introduce":[142],"two-level":[144],"compiler":[145],"optimize":[147],"parallelism":[149],"memory":[151],"allocation":[152],"schedule.":[153],"Experimental":[154],"results":[155],"show":[156],"that":[157],"FET-OPU":[159],"achieves":[160],"7.33-21.27\u00d7":[161],"speedup":[162],"231\u00d7":[164],"less":[165,177,191,209],"energy":[166,178,192,210],"consumption":[167,179,193],"compared":[168,180,194],"CPU,":[170],"1.56-4.08\u00d7":[172],"latency":[173,188],"reduction":[174],"5.85-66.36\u00d7":[176],"GPU.":[182],"Furthermore,":[183],"observe":[185],"1.56-8.21\u00d7":[186],"better":[187],"5.28-6.24\u00d7":[190],"previously":[196],"customized":[197],"FPGA/ASIC":[198],"be":[202],"2.05\u00d7":[203],"faster":[204],"than":[205],"NPE":[206],"5.55\u00d7":[208],"consumption.":[211]},"counts_by_year":[{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
