{"id":"https://openalex.org/W4413120551","doi":"https://doi.org/10.1109/tcad.2025.3597244","title":"Ultra Memory-Efficient On-FPGA Training of Transformers via Tensor-Compressed Optimization","display_name":"Ultra Memory-Efficient On-FPGA Training of Transformers via Tensor-Compressed Optimization","publication_year":2025,"publication_date":"2025-08-08","ids":{"openalex":"https://openalex.org/W4413120551","doi":"https://doi.org/10.1109/tcad.2025.3597244"},"language":"en","primary_location":{"id":"doi:10.1109/tcad.2025.3597244","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcad.2025.3597244","pdf_url":null,"source":{"id":"https://openalex.org/S100835903","display_name":"IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems","issn_l":"0278-0070","issn":["0278-0070","1937-4151"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113330192","display_name":"Jiayi Tian","orcid":null},"institutions":[{"id":"https://openalex.org/I154570441","display_name":"University of California, Santa Barbara","ror":"https://ror.org/02t274463","country_code":"US","type":"education","lineage":["https://openalex.org/I154570441"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiayi Tian","raw_affiliation_strings":["Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","Department of Electrical and Computer Engineering, University of California, Santa Barbara, CA, USA"],"raw_orcid":"https://orcid.org/0009-0006-2962-7386","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinming Lu","orcid":"https://orcid.org/0009-0007-0192-3703"},"institutions":[{"id":"https://openalex.org/I154570441","display_name":"University of California, Santa Barbara","ror":"https://ror.org/02t274463","country_code":"US","type":"education","lineage":["https://openalex.org/I154570441"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jinming Lu","raw_affiliation_strings":["Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","Department of Electrical and Computer Engineering, University of California, Santa Barbara, CA, USA"],"raw_orcid":"https://orcid.org/0009-0007-0192-3703","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035225490","display_name":"Hai Li","orcid":"https://orcid.org/0000-0001-7668-569X"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]},{"id":"https://openalex.org/I154570441","display_name":"University of California, Santa Barbara","ror":"https://ror.org/02t274463","country_code":"US","type":"education","lineage":["https://openalex.org/I154570441"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hai Li","raw_affiliation_strings":["Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","Intel Corporation, Hillsboro, OR, USA"],"raw_orcid":"https://orcid.org/0000-0001-7668-569X","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]},{"raw_affiliation_string":"Intel Corporation, Hillsboro, OR, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101688508","display_name":"Xiangwei Wang","orcid":"https://orcid.org/0009-0001-3649-0852"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiangwei Wang","raw_affiliation_strings":["Department of Computer Science, North Carolina State University, Raleigh, NC, USA","Department of Computer Science, North Carolina State University, USA"],"raw_orcid":"https://orcid.org/0009-0001-3649-0852","affiliations":[{"raw_affiliation_string":"Department of Computer Science, North Carolina State University, Raleigh, NC, USA","institution_ids":["https://openalex.org/I137902535"]},{"raw_affiliation_string":"Department of Computer Science, North Carolina State University, USA","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053022655","display_name":"Cong Hao","orcid":"https://orcid.org/0000-0002-2541-8767"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Cong Callie Hao","raw_affiliation_strings":["School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, GA, USA"],"raw_orcid":"https://orcid.org/0000-0002-2541-8767","affiliations":[{"raw_affiliation_string":"School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073072521","display_name":"Ian A. Young","orcid":"https://orcid.org/0000-0002-4017-5265"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ian Young","raw_affiliation_strings":["Department of Technology Research, Intel Corporation, Hillsboro, OR, USA","Intel Corporation, Hillsboro, OR, USA"],"raw_orcid":"https://orcid.org/0000-0002-4017-5265","affiliations":[{"raw_affiliation_string":"Department of Technology Research, Intel Corporation, Hillsboro, OR, USA","institution_ids":["https://openalex.org/I1343180700"]},{"raw_affiliation_string":"Intel Corporation, Hillsboro, OR, USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100459107","display_name":"Zheng Zhang","orcid":"https://orcid.org/0000-0002-2292-0030"},"institutions":[{"id":"https://openalex.org/I154570441","display_name":"University of California, Santa Barbara","ror":"https://ror.org/02t274463","country_code":"US","type":"education","lineage":["https://openalex.org/I154570441"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zheng Zhang","raw_affiliation_strings":["Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","Department of Electrical and Computer Engineering, University of California, Santa Barbara, CA, USA"],"raw_orcid":"https://orcid.org/0000-0002-2292-0030","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California at Santa Barbara, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, University of California, Santa Barbara, CA, USA","institution_ids":["https://openalex.org/I154570441"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.864,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.78565109,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"45","issue":"3","first_page":"1352","last_page":"1365"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11283","display_name":"Experimental Learning in Engineering","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11283","display_name":"Experimental Learning in Engineering","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13182","display_name":"Quantum-Dot Cellular Automata","score":0.9621000289916992,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10682","display_name":"Quantum Computing Algorithms and Architecture","score":0.9535999894142151,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.6124728918075562},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5251176953315735},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4597797989845276},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.45262643694877625},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.36749082803726196},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3653985261917114},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.3569743037223816},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.21152621507644653},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.18822649121284485},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.061709463596343994}],"concepts":[{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.6124728918075562},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5251176953315735},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4597797989845276},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.45262643694877625},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.36749082803726196},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3653985261917114},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3569743037223816},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.21152621507644653},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.18822649121284485},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.061709463596343994}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcad.2025.3597244","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcad.2025.3597244","pdf_url":null,"source":{"id":"https://openalex.org/S100835903","display_name":"IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems","issn_l":"0278-0070","issn":["0278-0070","1937-4151"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8399999737739563,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1993482030","https://openalex.org/W2024165284","https://openalex.org/W2077302143","https://openalex.org/W2572504188","https://openalex.org/W2787094505","https://openalex.org/W2962821792","https://openalex.org/W2963838731","https://openalex.org/W2985255011","https://openalex.org/W2988559961","https://openalex.org/W3003315945","https://openalex.org/W3048230991","https://openalex.org/W3095102311","https://openalex.org/W3103522166","https://openalex.org/W3118185732","https://openalex.org/W3134893068","https://openalex.org/W3135347465","https://openalex.org/W3159210794","https://openalex.org/W3198697344","https://openalex.org/W3206190169","https://openalex.org/W3208633927","https://openalex.org/W4210823275","https://openalex.org/W4213446428","https://openalex.org/W4285446270","https://openalex.org/W4291653336","https://openalex.org/W4297841396","https://openalex.org/W4372265769","https://openalex.org/W4380875583","https://openalex.org/W4385245566","https://openalex.org/W4385822522","https://openalex.org/W4386902741","https://openalex.org/W4389163044","https://openalex.org/W4390423602","https://openalex.org/W4390874575","https://openalex.org/W4400231005","https://openalex.org/W4401042806","https://openalex.org/W4404783269","https://openalex.org/W4412888769"],"related_works":["https://openalex.org/W2111241003","https://openalex.org/W2355315220","https://openalex.org/W4200391368","https://openalex.org/W2210979487","https://openalex.org/W2316202402","https://openalex.org/W2074043759","https://openalex.org/W2082487009","https://openalex.org/W2373535795","https://openalex.org/W3103262449","https://openalex.org/W2787763930"],"abstract_inverted_index":{"Transformer":[0],"models":[1,161],"have":[2],"achieved":[3],"state-of-the-art":[4],"performance":[5],"across":[6],"a":[7,77,191,216],"wide":[8],"range":[9],"of":[10,53,194,219,257],"machine":[11,36],"learning":[12],"tasks.":[13],"There":[14],"is":[15],"growing":[16],"interest":[17],"in":[18,122],"training":[19,48,142,183,205,214,240,260],"transformers":[20],"on":[21,113,159,171,184,206,241,261],"resource-constrained":[22],"edge":[23,55,262],"devices":[24],"due":[25],"to":[26,95,150,164,203,221,229],"considerations":[27],"such":[28],"as":[29],"privacy,":[30],"domain":[31],"adaptation,":[32],"and":[33,42,90,110,128,131,144,148,154,199],"on-device":[34],"scientific":[35],"learning.":[37],"However,":[38],"the":[39,51,64,72,87,100,172,185,207,254],"significant":[40,255],"computational":[41,88],"memory":[43,92,155,192,217],"demands":[44],"required":[45],"for":[46,68,81,119,140],"transformer":[47,69,83,160,239],"often":[49],"exceed":[50],"capabilities":[52],"an":[54,116,242,248],"device.":[56],"Leveraging":[57],"low-rank":[58],"tensor":[59,97,238,259],"compression,":[60],"this":[61,251],"paper":[62],"presents":[63],"first":[65],"on-FPGA":[66,213],"accelerator":[67,178,225],"training.":[70,123],"On":[71,99],"algorithm":[73],"side,":[74,102],"we":[75,103,135],"present":[76],"bi-directional":[78],"contraction":[79],"flow":[80],"tensorized":[82,176],"training,":[84],"significantly":[85],"reducing":[86],"FLOPS":[89],"intra-layer":[91,146],"costs":[93],"compared":[94,236],"existing":[96],"operations.":[98],"hardware":[101],"store":[104],"all":[105],"highly":[106],"compressed":[107],"model":[108],"parameters":[109],"gradient":[111],"information":[112],"chip,":[114],"creating":[115],"on-chip-memory-only":[117],"framework":[118],"each":[120,141],"stage":[121,143],"This":[124],"reduces":[125],"off-chip":[126],"communication":[127],"minimizes":[129],"latency":[130],"energy":[132,232],"costs.":[133],"Additionally,":[134],"implement":[136],"custom":[137],"computing":[138],"kernels":[139],"employ":[145],"parallelism":[147],"pipe-lining":[149],"further":[151],"enhance":[152],"run-time":[153],"efficiency.":[156],"Through":[157],"experiments":[158],"within":[162],"36.7":[163],"93.5":[165],"MB":[166],"using":[167],"FP-32":[168],"data":[169],"formats":[170],"ATIS":[173],"dataset,":[174],"our":[175,212],"FPGA":[177,224],"could":[179],"conduct":[180],"single-batch":[181],"end-to-end":[182],"AMD":[186],"Alevo":[187],"U50":[188],"FPGA,":[189],"with":[190,237],"budget":[193],"less":[195,231],"than":[196],"6-MB":[197],"BRAM":[198],"22.5-MB":[200],"URAM.":[201],"Compared":[202],"uncompressed":[204],"NVIDIA":[208,243],"RTX":[209,244],"3090":[210,245],"GPU,":[211],"achieves":[215,227],"reduction":[218],"30\u00d7":[220],"51\u00d7.":[222],"Our":[223],"also":[226],"up":[228],"4.0\u00d7":[230],"cost":[233],"per":[234],"epoch":[235],"GPU.":[246],"As":[247],"initial":[249],"result,":[250],"work":[252],"highlights":[253],"potential":[256],"large-scale":[258],"devices.":[263]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
