{"id":"https://openalex.org/W7116429508","doi":"https://doi.org/10.1145/3754598.3754656","title":"Automated FPGA Accelerator Generation Framework for Transformers with Dataflow Optimization","display_name":"Automated FPGA Accelerator Generation Framework for Transformers with Dataflow Optimization","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116429508","doi":"https://doi.org/10.1145/3754598.3754656"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754656","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754656","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754656","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034765006","display_name":"Wenqi Lou","orcid":"https://orcid.org/0000-0002-2240-6672"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wenqi Lou","raw_affiliation_strings":["Suzhou Institute for Advanced Research, University of Science and Technology of China, Suzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-2240-6672","affiliations":[{"raw_affiliation_string":"Suzhou Institute for Advanced Research, University of Science and Technology of China, Suzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014908428","display_name":"Yunji Qin","orcid":"https://orcid.org/0009-0005-9404-581X"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunji Qin","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0009-0005-9404-581X","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zihao Wang","orcid":"https://orcid.org/0009-0008-0043-6433"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zihao Wang","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0009-0008-0043-6433","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120903215","display_name":"Chao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Wang","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-9403-5575","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Lei Gong","orcid":"https://orcid.org/0000-0002-8391-5526"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]},{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Gong","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China and Anhui Prov. Key Lab of HPC, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-8391-5526","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China and Anhui Prov. Key Lab of HPC, Hefei, China","institution_ids":["https://openalex.org/I126520041","https://openalex.org/I16365422"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5120886596","display_name":"Xuehai Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuehai Zhou","raw_affiliation_strings":["University of Science and Technology of China, Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-8360-3143","affiliations":[{"raw_affiliation_string":"University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5034765006"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64750643,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"406","last_page":"416"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.7382000088691711,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.7382000088691711,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.10779999941587448,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11522","display_name":"VLSI and FPGA Design Techniques","score":0.03060000017285347,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.8233000040054321},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.7113000154495239},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6266000270843506},{"id":"https://openalex.org/keywords/design-space-exploration","display_name":"Design space exploration","score":0.4788999855518341},{"id":"https://openalex.org/keywords/parameterized-complexity","display_name":"Parameterized complexity","score":0.4480000138282776},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.4352000057697296},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.4302000105381012},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.41760000586509705},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4065999984741211}],"concepts":[{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.8233000040054321},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7545999884605408},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.7113000154495239},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6266000270843506},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.5266000032424927},{"id":"https://openalex.org/C2776221188","wikidata":"https://www.wikidata.org/wiki/Q21072556","display_name":"Design space exploration","level":2,"score":0.4788999855518341},{"id":"https://openalex.org/C165464430","wikidata":"https://www.wikidata.org/wiki/Q1570441","display_name":"Parameterized complexity","level":2,"score":0.4480000138282776},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.4352000057697296},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.4302000105381012},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.41760000586509705},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.412200003862381},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4065999984741211},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.40450000762939453},{"id":"https://openalex.org/C77390884","wikidata":"https://www.wikidata.org/wiki/Q217302","display_name":"Application-specific integrated circuit","level":2,"score":0.38600000739097595},{"id":"https://openalex.org/C142962650","wikidata":"https://www.wikidata.org/wiki/Q240838","display_name":"Reconfigurable computing","level":3,"score":0.3781000077724457},{"id":"https://openalex.org/C37135326","wikidata":"https://www.wikidata.org/wiki/Q931942","display_name":"Design flow","level":2,"score":0.37560001015663147},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3675000071525574},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.358599990606308},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.3578999936580658},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3424000144004822},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3384999930858612},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3352999985218048},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.32359999418258667},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.311599999666214},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.3102000057697296},{"id":"https://openalex.org/C58013763","wikidata":"https://www.wikidata.org/wiki/Q5754574","display_name":"High-level synthesis","level":3,"score":0.28110000491142273},{"id":"https://openalex.org/C176727019","wikidata":"https://www.wikidata.org/wiki/Q1172415","display_name":"Dataflow architecture","level":3,"score":0.2784000039100647},{"id":"https://openalex.org/C65232700","wikidata":"https://www.wikidata.org/wiki/Q5656403","display_name":"Hardware architecture","level":3,"score":0.2782000005245209},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2639999985694885},{"id":"https://openalex.org/C76970557","wikidata":"https://www.wikidata.org/wiki/Q1869750","display_name":"Loop unrolling","level":3,"score":0.25940001010894775}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754656","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754656","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754656","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754656","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2200096126","display_name":null,"funder_award_id":"XDB0660101","funder_id":"https://openalex.org/F4320321133","funder_display_name":"Chinese Academy of Sciences"},{"id":"https://openalex.org/G5069622250","display_name":null,"funder_award_id":"XDB0660000","funder_id":"https://openalex.org/F4320321133","funder_display_name":"Chinese Academy of Sciences"}],"funders":[{"id":"https://openalex.org/F4320321133","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W3047848469","https://openalex.org/W3090389586","https://openalex.org/W3102790199","https://openalex.org/W3112948415","https://openalex.org/W3130240120","https://openalex.org/W3159727696","https://openalex.org/W3198625877","https://openalex.org/W3206453033","https://openalex.org/W4213019189","https://openalex.org/W4214686755","https://openalex.org/W4286001027","https://openalex.org/W4308479898","https://openalex.org/W4318541578","https://openalex.org/W4360831786","https://openalex.org/W4385187240","https://openalex.org/W4388214800","https://openalex.org/W4389162698","https://openalex.org/W4394966842","https://openalex.org/W4399487406","https://openalex.org/W4403210793","https://openalex.org/W4404103067","https://openalex.org/W4404955137","https://openalex.org/W4405755146","https://openalex.org/W4411725841","https://openalex.org/W4413145941"],"related_works":[],"abstract_inverted_index":{"Transformers":[0],"have":[1],"revolutionized":[2],"natural":[3],"language":[4],"processing":[5],"(NLP)":[6],"and":[7,36,44,75,87,106,187,191],"computer":[8],"vision":[9,192],"(CV)":[10],"tasks,":[11],"yet":[12],"their":[13],"deployment":[14],"remains":[15],"constrained":[16],"by":[17,110],"the":[18,72,122,131,167],"quadratic":[19],"complexity":[20],"of":[21,124],"self-attention.":[22],"While":[23],"FPGA-based":[24],"accelerators":[25,58],"offer":[26],"promising":[27],"solutions,":[28],"existing":[29],"designs":[30],"struggle":[31],"with":[32,143,163],"fixed":[33],"dataflow":[34,64],"patterns":[35],"limited":[37],"hardware":[38,91],"specialization":[39],"across":[40,126,189],"diverse":[41],"model":[42],"architectures":[43],"sequence":[45],"lengths.":[46],"This":[47],"paper":[48],"presents":[49],"AutoTrans,":[50],"an":[51],"automated":[52],"framework":[53],"for":[54,78,104],"generating":[55],"optimized":[56],"FPGA":[57],"tailored":[59],"to":[60,137,151,173,178],"Transformers.":[61],"To":[62],"improve":[63],"flexibility,":[65],"we":[66,93],"propose":[67],"configurable":[68],"fusion":[69],"granularities":[70],"at":[71],"tensor,":[73],"row,":[74],"block":[76],"levels":[77],"self-attention":[79],"layers,":[80,108],"enabling":[81],"fine-grained":[82],"trade-offs":[83],"between":[84],"computational":[85],"efficiency":[86,157],"resource":[88],"utilization.":[89],"For":[90,161],"specialization,":[92],"develop":[94],"a":[95,111,148],"parameterized":[96],"heterogeneous":[97],"multi-core":[98],"architecture":[99],"featuring":[100],"dedicated":[101],"compute":[102,182],"engines":[103],"attention":[105],"linear":[107],"guided":[109],"genetic":[112],"algorithm-based":[113],"design":[114],"space":[115],"exploration":[116],"(DSE)":[117],"strategy.":[118],"Experimental":[119],"results":[120],"demonstrate":[121],"effectiveness":[123],"AutoTrans":[125,134,170],"varied":[127],"application":[128],"scenarios.":[129],"On":[130],"ZCU102":[132],"board,":[133],"achieves":[135],"up":[136,172,177],"621":[138],"GOPS":[139],"when":[140],"accelerating":[141],"BERT-base":[142],"4":[144],"K-token":[145],"inputs,":[146],"yielding":[147],"1.27":[149],"\u00d7":[150,153,180],"1.91":[152],"improvement":[154],"in":[155],"energy":[156],"over":[158],"previous":[159],"designs.":[160],"ViT-Base":[162],"256-token":[164],"inputs":[165],"on":[166],"Alveo":[168],"U50,":[169],"attains":[171],"1548":[174],"GOPS,":[175],"achieving":[176],"1.96":[179],"higher":[181],"density,":[183],"highlighting":[184],"its":[185],"scalability":[186],"adaptability":[188],"NLP":[190],"domains.":[193]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-12-21T00:00:00"}
