{"id":"https://openalex.org/W4413755548","doi":"https://doi.org/10.1109/isvlsi65124.2025.11130250","title":"FastMamba: A High-Speed and Efficient Mamba Accelerator on FPGA with Accurate Quantization","display_name":"FastMamba: A High-Speed and Efficient Mamba Accelerator on FPGA with Accurate Quantization","publication_year":2025,"publication_date":"2025-07-06","ids":{"openalex":"https://openalex.org/W4413755548","doi":"https://doi.org/10.1109/isvlsi65124.2025.11130250"},"language":"en","primary_location":{"id":"doi:10.1109/isvlsi65124.2025.11130250","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isvlsi65124.2025.11130250","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Aotao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Aotao Wang","raw_affiliation_strings":["Nanjing University,School of Electronic Science and Engineering,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Nanjing University,School of Electronic Science and Engineering,Nanjing,China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090170161","display_name":"Haikuo Shao","orcid":"https://orcid.org/0009-0008-6965-3436"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haikuo Shao","raw_affiliation_strings":["Nanjing University,School of Electronic Science and Engineering,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Nanjing University,School of Electronic Science and Engineering,Nanjing,China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109277114","display_name":"Shaobo Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaobo Ma","raw_affiliation_strings":["Nanjing University,School of Electronic Science and Engineering,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Nanjing University,School of Electronic Science and Engineering,Nanjing,China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100696995","display_name":"Zhongfeng Wang","orcid":"https://orcid.org/0000-0002-7227-4786"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongfeng Wang","raw_affiliation_strings":["Nanjing University,School of Electronic Science and Engineering,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Nanjing University,School of Electronic Science and Engineering,Nanjing,China","institution_ids":["https://openalex.org/I881766915"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I881766915"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.23259168,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9735000133514404,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9735000133514404,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11992","display_name":"CCD and CMOS Imaging Sensors","score":0.9642000198364258,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13182","display_name":"Quantum-Dot Cellular Automata","score":0.95169997215271,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.8301159143447876},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6245603561401367},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.47772595286369324},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.44609126448631287},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.06723061203956604}],"concepts":[{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.8301159143447876},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6245603561401367},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.47772595286369324},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.44609126448631287},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.06723061203956604}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/isvlsi65124.2025.11130250","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isvlsi65124.2025.11130250","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Computer Society Annual Symposium on VLSI (ISVLSI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.8199999928474426,"display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W3036061420","https://openalex.org/W3217128056","https://openalex.org/W4226322872","https://openalex.org/W4320930577","https://openalex.org/W4400878378","https://openalex.org/W4403842425","https://openalex.org/W4409282403"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2111241003","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2096844293","https://openalex.org/W2363944576","https://openalex.org/W2351041855","https://openalex.org/W2570254841"],"abstract_inverted_index":{"State":[0],"Space":[1],"Models":[2],"(SSMs),":[3],"like":[4],"recent":[5],"Mamba2,":[6],"have":[7],"achieved":[8],"remarkable":[9],"performance":[10],"and":[11,35,40,89,100,103,134,145,169,178],"received":[12],"extensive":[13],"attention.":[14],"However,":[15],"deploying":[16],"Mamba2":[17],"on":[18,59,116,153,162],"resource-constrained":[19],"edge":[20],"devices":[21],"encounters":[22],"many":[23],"problems:":[24],"severe":[25],"outliers":[26],"within":[27],"the":[28,32,45,66,97,112,117,158,185],"linear":[29,78,106],"layer":[30],"challenging":[31],"quantization,":[33,120],"diverse":[34],"irregular":[36],"element-wise":[37],"tensor":[38],"operations,":[39],"hardware-unfriendly":[41],"nonlinear":[42,113],"functions":[43],"in":[44],"SSM":[46,98,137],"block.":[47],"To":[48],"address":[49],"these":[50],"issues,":[51],"this":[52],"paper":[53],"presents":[54],"FastMamba,":[55],"a":[56,87,104],"dedicated":[57],"accelerator":[58,124],"FPGA":[60],"with":[61,189],"hardware-algorithm":[62],"co-design":[63],"to":[64,83,110],"promote":[65],"deployment":[67],"efficiency":[68,144,197],"of":[69],"Mamba2.":[70],"Specifically,":[71],"we":[72,121,150],"successfully":[73],"achieve":[74],"8-bit":[75],"quantization":[76,92],"for":[77,96],"layers":[79],"through":[80],"Hadamard":[81],"transformation":[82],"eliminate":[84],"outliers.":[85],"Moreover,":[86],"hardware-friendly":[88],"finegrained":[90],"power-of-two":[91],"framework":[93],"is":[94,108],"presented":[95],"block":[99],"convolution":[101],"layer,":[102],"first-order":[105],"approximation":[107],"developed":[109],"optimize":[111],"functions.":[114],"Based":[115],"accurate":[118],"algorithm":[119],"propose":[122],"an":[123,135],"that":[125],"integrates":[126],"parallel":[127],"vector":[128],"processing":[129],"units,":[130],"pipelined":[131],"execution":[132],"dataflow,":[133],"efficient":[136],"Nonlinear":[138],"Approximation":[139],"Unit,":[140],"which":[141],"enhances":[142],"computational":[143],"reduces":[146],"hardware":[147],"complexity.":[148],"Finally,":[149],"evaluate":[151],"FastMamba":[152,165,191],"Xilinx":[154],"VC709":[155],"FPGA.":[156],"For":[157],"input":[159],"prefill":[160],"task":[161],"Mamba2130":[163],"M,":[164],"achieves":[166],"$68.80":[167],"\\times$":[168,171,194],"$8.90":[170],"speedup":[172],"over":[173],"Intel":[174],"Xeon":[175],"4210R":[176],"CPU":[177],"NVIDIA":[179],"RTX":[180,199],"3090":[181,200],"GPU,":[182],"respectively.":[183],"In":[184],"output":[186],"decode":[187],"experiment":[188],"Mamba2-2.7B,":[190],"attains":[192],"$1.65":[193],"higher":[195],"energy":[196],"than":[198],"GPU.":[201]},"counts_by_year":[],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
