{"id":"https://openalex.org/W4416429745","doi":"https://doi.org/10.1109/iccad66269.2025.11240811","title":"ToMamba: Towards Token-Efficient Mamba Architecture on FPGA","display_name":"ToMamba: Towards Token-Efficient Mamba Architecture on FPGA","publication_year":2025,"publication_date":"2025-10-26","ids":{"openalex":"https://openalex.org/W4416429745","doi":"https://doi.org/10.1109/iccad66269.2025.11240811"},"language":null,"primary_location":{"id":"doi:10.1109/iccad66269.2025.11240811","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240811","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102312689","display_name":"Kejia Shi","orcid":"https://orcid.org/0009-0001-8471-0995"},"institutions":[{"id":"https://openalex.org/I4210132426","display_name":"Shanghai Fudan Microelectronics (China)","ror":"https://ror.org/02vfj3j86","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210132426"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Kejia Shi","raw_affiliation_strings":["Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China","institution_ids":["https://openalex.org/I4210132426"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001376937","display_name":"Yue Cao","orcid":"https://orcid.org/0000-0001-6948-7222"},"institutions":[{"id":"https://openalex.org/I4210132426","display_name":"Shanghai Fudan Microelectronics (China)","ror":"https://ror.org/02vfj3j86","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210132426"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yue Cao","raw_affiliation_strings":["Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China","institution_ids":["https://openalex.org/I4210132426"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101685746","display_name":"Yuhang Du","orcid":"https://orcid.org/0000-0003-3821-4322"},"institutions":[{"id":"https://openalex.org/I4210132426","display_name":"Shanghai Fudan Microelectronics (China)","ror":"https://ror.org/02vfj3j86","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210132426"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuhang Du","raw_affiliation_strings":["Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China","institution_ids":["https://openalex.org/I4210132426"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113333659","display_name":"Jianli Chen","orcid":"https://orcid.org/0000-0001-6773-6248"},"institutions":[{"id":"https://openalex.org/I4210132426","display_name":"Shanghai Fudan Microelectronics (China)","ror":"https://ror.org/02vfj3j86","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210132426"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianli Chen","raw_affiliation_strings":["Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China","institution_ids":["https://openalex.org/I4210132426"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101659303","display_name":"Jun Yu","orcid":"https://orcid.org/0000-0001-5029-0294"},"institutions":[{"id":"https://openalex.org/I4210132426","display_name":"Shanghai Fudan Microelectronics (China)","ror":"https://ror.org/02vfj3j86","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210132426"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jun Yu","raw_affiliation_strings":["Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China","institution_ids":["https://openalex.org/I4210132426"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5115695319","display_name":"Kun Wang","orcid":"https://orcid.org/0009-0006-7995-3796"},"institutions":[{"id":"https://openalex.org/I4210132426","display_name":"Shanghai Fudan Microelectronics (China)","ror":"https://ror.org/02vfj3j86","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210132426"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kun Wang","raw_affiliation_strings":["Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China"],"affiliations":[{"raw_affiliation_string":"Fudan University,State Key Lab of Integrated Chips &#x0026; Systems, and School of Microelectronics,Shanghai,China","institution_ids":["https://openalex.org/I4210132426"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5102312689"],"corresponding_institution_ids":["https://openalex.org/I4210132426"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37718554,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.4223000109195709,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.4223000109195709,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.16050000488758087,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.15800000727176666,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.6913999915122986},{"id":"https://openalex.org/keywords/application-specific-integrated-circuit","display_name":"Application-specific integrated circuit","score":0.557699978351593},{"id":"https://openalex.org/keywords/xeon","display_name":"Xeon","score":0.4966000020503998},{"id":"https://openalex.org/keywords/voltage-reduction","display_name":"Voltage reduction","score":0.4754999876022339},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.47130000591278076},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.47040000557899475},{"id":"https://openalex.org/keywords/design-space-exploration","display_name":"Design space exploration","score":0.454800009727478},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.44350001215934753},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.41019999980926514},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4075999855995178}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7638999819755554},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.6913999915122986},{"id":"https://openalex.org/C77390884","wikidata":"https://www.wikidata.org/wiki/Q217302","display_name":"Application-specific integrated circuit","level":2,"score":0.557699978351593},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.5333999991416931},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.5},{"id":"https://openalex.org/C145108525","wikidata":"https://www.wikidata.org/wiki/Q656154","display_name":"Xeon","level":2,"score":0.4966000020503998},{"id":"https://openalex.org/C2780745134","wikidata":"https://www.wikidata.org/wiki/Q7940751","display_name":"Voltage reduction","level":3,"score":0.4754999876022339},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.47130000591278076},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.47040000557899475},{"id":"https://openalex.org/C2776221188","wikidata":"https://www.wikidata.org/wiki/Q21072556","display_name":"Design space exploration","level":2,"score":0.454800009727478},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4474000036716461},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.44350001215934753},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.41019999980926514},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4075999855995178},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.40119999647140503},{"id":"https://openalex.org/C58013763","wikidata":"https://www.wikidata.org/wiki/Q5754574","display_name":"High-level synthesis","level":3,"score":0.3903999924659729},{"id":"https://openalex.org/C142962650","wikidata":"https://www.wikidata.org/wiki/Q240838","display_name":"Reconfigurable computing","level":3,"score":0.37610000371932983},{"id":"https://openalex.org/C65232700","wikidata":"https://www.wikidata.org/wiki/Q5656403","display_name":"Hardware architecture","level":3,"score":0.3495999872684479},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3336000144481659},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3321000039577484},{"id":"https://openalex.org/C2779106878","wikidata":"https://www.wikidata.org/wiki/Q1257510","display_name":"Digital audio broadcasting","level":2,"score":0.32910001277923584},{"id":"https://openalex.org/C170723468","wikidata":"https://www.wikidata.org/wiki/Q182933","display_name":"x86","level":3,"score":0.3156000077724457},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.31459999084472656},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.305400013923645},{"id":"https://openalex.org/C75172450","wikidata":"https://www.wikidata.org/wiki/Q623950","display_name":"Fast Fourier transform","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.29429998993873596},{"id":"https://openalex.org/C84462506","wikidata":"https://www.wikidata.org/wiki/Q173142","display_name":"Digital signal processing","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C22174128","wikidata":"https://www.wikidata.org/wiki/Q175869","display_name":"Microcode","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.2818000018596649},{"id":"https://openalex.org/C106516650","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm design","level":2,"score":0.2784999907016754},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.27149999141693115},{"id":"https://openalex.org/C118021083","wikidata":"https://www.wikidata.org/wiki/Q610398","display_name":"System on a chip","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C19275194","wikidata":"https://www.wikidata.org/wiki/Q222903","display_name":"Multiplexing","level":2,"score":0.26499998569488525},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26179999113082886},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.25949999690055847}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccad66269.2025.11240811","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240811","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2115452265","https://openalex.org/W2795915628","https://openalex.org/W2946609015","https://openalex.org/W2998617917","https://openalex.org/W3017024317","https://openalex.org/W3159727696","https://openalex.org/W3194676777","https://openalex.org/W3213174544","https://openalex.org/W4291653336","https://openalex.org/W4385245566","https://openalex.org/W4385572090","https://openalex.org/W4389166707","https://openalex.org/W4389524393","https://openalex.org/W4392181806","https://openalex.org/W4403277448","https://openalex.org/W4404133659","https://openalex.org/W4404133677","https://openalex.org/W4404133735","https://openalex.org/W4404133870","https://openalex.org/W4404783218","https://openalex.org/W4409282403","https://openalex.org/W4410553236"],"related_works":[],"abstract_inverted_index":{"The":[0],"State":[1],"Space":[2],"Model":[3],"(SSM),":[4],"particularly":[5],"the":[6,33,38,70,94,103,125,181,199],"Mamba":[7,100,182],"implementation,":[8],"has":[9],"demonstrated":[10],"impressive":[11],"capabilities":[12],"across":[13,194],"various":[14],"domains.":[15],"It":[16],"offers":[17],"a":[18,108,119,128,187],"significant":[19],"reduction":[20,57,201],"in":[21,60,67,217],"computational":[22],"complexity":[23],"compared":[24,219,234,253],"to":[25,42,63,98,134,157,215,220,231,235,254],"Transformers":[26,61],"while":[27],"achieving":[28],"higher":[29],"algorithm":[30,113],"accuracy.":[31],"However,":[32],"ineffectiveness":[34],"of":[35,78,203],"spatially":[36],"unfolding":[37],"SSM":[39,155],"layer":[40],"leads":[41],"increased":[43],"latency":[44],"as":[45],"sentence":[46],"length":[47],"grows,":[48],"especially":[49],"when":[50],"being":[51],"deployed":[52],"on":[53,85,226],"FPGA.":[54,86],"Previous":[55],"token":[56,111,200],"methods":[58],"introduced":[59],"fail":[62],"maintain":[64],"high":[65],"performance":[66],"Mamba.":[68],"Moreover,":[69],"dispersed":[71],"outliers,":[72],"complex":[73],"model":[74],"structure":[75],"and":[76,118,137,144,161,168,245,248,256],"variety":[77],"non-linear":[79],"operators":[80],"obstruct":[81],"its":[82],"efficient":[83,165],"implementation":[84,225],"To":[87,177],"address":[88],"these":[89],"challenges,":[90],"we":[91],"propose":[92],"ToMamba,":[93],"first":[95],"algorithm-architecture":[96],"co-design":[97],"optimize":[99],"implementation.":[101],"At":[102],"algorithmic":[104],"level,":[105],"ToMamba":[106,184,204,223],"incorporates":[107],"novel":[109],"progressive":[110],"merging":[112,179],"with":[114,208],"minimal":[115],"hardware":[116,126,159,166,224],"consumption":[117],"hardware-aware":[120],"fine-grained":[121,149],"quantization":[122],"strategy.":[123],"On":[124],"side,":[127],"dualflow":[129],"systolic":[130],"array":[131],"is":[132,152],"designed":[133],"unify":[135],"convolution":[136],"matrix":[138],"multiplication,":[139],"supporting":[140],"both":[141],"weight":[142],"stationary":[143,146],"output":[145],"dataflow.":[147],"A":[148],"pipeline":[150],"design":[151],"adopted":[153],"for":[154,171],"computation":[156],"maximize":[158],"efficiency":[160,251],"enhance":[162],"throughput.":[163],"Furthermore,":[164],"architecture":[167],"approximation":[169],"method":[170,202],"nonlinear":[172],"function":[173],"units":[174],"are":[175],"proposed.":[176],"enable":[178],"after":[180],"layer,":[183],"also":[185],"adopts":[186],"dedicated":[188],"data":[189],"mapping":[190],"scheme.":[191],"Comprehensive":[192],"evaluations":[193],"multiple":[195],"benchmarks":[196],"demonstrate":[197],"that":[198],"achieves":[205,229],"10%":[206],"sparsity":[207],"only":[209],"0.25%":[210],"accuracy":[211,218],"loss,":[212],"improving":[213],"up":[214,230],"16.89%":[216],"previous":[221],"methods.":[222],"U280":[227],"FPGA":[228],"636.00\u00d7/11.01\u00d7/1.39\u00d7":[232],"speedup":[233],"Intel":[236],"Xeon":[237],"Platinum":[238],"8369B":[239],"CPU,":[240],"NVIDIA":[241],"Tesla":[242],"A100":[243],"GPU":[244,257],"ASIC":[246],"platforms":[247],"1280\u00d7/44.32\u00d7":[249],"energy":[250],"improvement":[252],"CPU":[255],"platforms.":[258]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-20T00:00:00"}
