{"id":"https://openalex.org/W4414198410","doi":"https://doi.org/10.1109/dac63849.2025.11132895","title":"MambaOPU: An FPGA Overlay Processor for State-space-duality-based Mamba Models","display_name":"MambaOPU: An FPGA Overlay Processor for State-space-duality-based Mamba Models","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414198410","doi":"https://doi.org/10.1109/dac63849.2025.11132895"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11132895","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132895","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101262539","display_name":"Shaoqiang Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shaoqiang Lu","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067237630","display_name":"Xuliang Yu","orcid":"https://orcid.org/0000-0001-8414-6960"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuliang Yu","raw_affiliation_strings":["Zhejiang University,Hangzhou,China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University,Hangzhou,China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073130132","display_name":"Tiandong Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I161318765","display_name":"University of California, Los Angeles","ror":"https://ror.org/046rm7j60","country_code":"US","type":"education","lineage":["https://openalex.org/I161318765"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tiandong Zhao","raw_affiliation_strings":["University of California,Los Angeles,USA"],"affiliations":[{"raw_affiliation_string":"University of California,Los Angeles,USA","institution_ids":["https://openalex.org/I161318765"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107092427","display_name":"Siyuan Miao","orcid":null},"institutions":[{"id":"https://openalex.org/I161318765","display_name":"University of California, Los Angeles","ror":"https://ror.org/046rm7j60","country_code":"US","type":"education","lineage":["https://openalex.org/I161318765"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Siyuan Miao","raw_affiliation_strings":["University of California,Los Angeles,USA"],"affiliations":[{"raw_affiliation_string":"University of California,Los Angeles,USA","institution_ids":["https://openalex.org/I161318765"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112166204","display_name":"X. Sheng","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinsong Sheng","raw_affiliation_strings":["Zhejiang University,Hangzhou,China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University,Hangzhou,China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102825845","display_name":"Chen Wu","orcid":"https://orcid.org/0000-0001-6035-5004"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen Wu","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103028181","display_name":"Liang Zhao","orcid":"https://orcid.org/0000-0003-1796-460X"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Zhao","raw_affiliation_strings":["Zhejiang University,Hangzhou,China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University,Hangzhou,China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101612957","display_name":"Ting-Jung Lin","orcid":"https://orcid.org/0000-0002-5208-482X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ting-Jung Lin","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5008695429","display_name":"Lei He","orcid":"https://orcid.org/0000-0002-5266-3805"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lei He","raw_affiliation_strings":["Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China"],"affiliations":[{"raw_affiliation_string":"Ningbo Institute of Digital Twin, Eastern Institute of Technology,Ningbo,China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5101262539"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28099437,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9466999769210815,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9466999769210815,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.6606000065803528},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.5275999903678894},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.46860000491142273},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.45669999718666077},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.45649999380111694},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.39739999175071716},{"id":"https://openalex.org/keywords/stencil","display_name":"Stencil","score":0.39010000228881836},{"id":"https://openalex.org/keywords/xeon","display_name":"Xeon","score":0.33320000767707825},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.33149999380111694}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.75},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6714000105857849},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.6606000065803528},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.5275999903678894},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.46860000491142273},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.45669999718666077},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.45649999380111694},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.39739999175071716},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.39149999618530273},{"id":"https://openalex.org/C76752949","wikidata":"https://www.wikidata.org/wiki/Q7607499","display_name":"Stencil","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C145108525","wikidata":"https://www.wikidata.org/wiki/Q656154","display_name":"Xeon","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.33149999380111694},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3305000066757202},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.32260000705718994},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3107999861240387},{"id":"https://openalex.org/C134765980","wikidata":"https://www.wikidata.org/wiki/Q879126","display_name":"Bitwise operation","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C194080101","wikidata":"https://www.wikidata.org/wiki/Q46306","display_name":"Access time","level":2,"score":0.2838999927043915},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.28349998593330383},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2797999978065491},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.27070000767707825},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C47487241","wikidata":"https://www.wikidata.org/wiki/Q5227230","display_name":"Data access","level":2,"score":0.25270000100135803},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11132895","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132895","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2606722458","https://openalex.org/W2949591530","https://openalex.org/W2979455536","https://openalex.org/W3008408165","https://openalex.org/W3017024317","https://openalex.org/W3034100311","https://openalex.org/W3130240120","https://openalex.org/W3130554079","https://openalex.org/W3159727696","https://openalex.org/W3189877953","https://openalex.org/W3206749822","https://openalex.org/W3207622241","https://openalex.org/W3212505503","https://openalex.org/W4200122815","https://openalex.org/W4214686755","https://openalex.org/W4308083513","https://openalex.org/W4308479898","https://openalex.org/W4321637192","https://openalex.org/W4360831786","https://openalex.org/W4360832001","https://openalex.org/W4380881077","https://openalex.org/W4386065887","https://openalex.org/W4388214742","https://openalex.org/W4393578753","https://openalex.org/W4401414720","https://openalex.org/W4409282403","https://openalex.org/W4409285540","https://openalex.org/W4409285593"],"related_works":[],"abstract_inverted_index":{"State-space":[0],"models":[1],"(SSMs),":[2],"such":[3],"as":[4,8],"Mamba,":[5],"have":[6],"emerged":[7],"a":[9,67,84,118,123,137],"promising":[10],"alternative":[11],"to":[12,57,61,108,141,157,168],"Transformers.":[13],"However,":[14],"the":[15,100],"recently":[16],"developed":[17],"Mamba2,":[18],"based":[19],"on":[20],"state":[21],"space":[22],"duality":[23],"(SSD),":[24],"is":[25],"highly":[26],"memorybound":[27],"and":[28,43,80,103,160,166,171,183],"suffers":[29],"from":[30,37],"limited":[31],"computation":[32,101,105,111,147],"efficiency.":[33,106],"This":[34],"inefficiency":[35],"arises":[36],"its":[38],"irregular":[39],"broadcast":[40,78],"element-wise":[41],"multiplications":[42],"structured":[44],"sparse":[45,110],"computations.":[46],"In":[47],"this":[48],"work,":[49],"we":[50,65,113,135],"propose":[51],"MambaOPU,":[52],"an":[53],"FPGA":[54],"overlay":[55],"processor,":[56],"accelerate":[58],"SSD.":[59],"First,":[60],"reduce":[62],"memory":[63],"overhead,":[64],"introduce":[66],"software-hardware":[68],"co-optimized":[69],"operator":[70,74,88],"fusion":[71],"framework.":[72],"Specifically,":[73],"merging":[75],"combines":[76],"adjacent":[77],"multiplication":[79,93],"summation":[81],"operations":[82,132],"into":[83,94],"single":[85],"descriptor,":[86],"while":[87],"backward":[89],"shifting":[90],"embeds":[91],"segment":[92],"subsequent":[95],"operations.":[96],"Both":[97],"techniques":[98],"shorten":[99],"path":[102],"improve":[104,142],"Second,":[107],"enhance":[109],"efficiency,":[112],"skip":[114],"zero-region":[115],"computations":[116],"using":[117],"tensor-reorder-and-group":[119],"algorithm":[120],"combined":[121],"with":[122,133],"sparse-predefined":[124],"data":[125,143],"fetcher.":[126],"Additionally,":[127],"since":[128],"Mamba":[129],"integrates":[130],"linear":[131],"SSD,":[134],"develop":[136],"reconfigurable":[138],"systolic":[139],"array":[140],"reuse":[144],"across":[145],"different":[146],"modes.":[148],"Extensive":[149],"experiment":[150],"results":[151],"demonstrate":[152],"that":[153],"MambaOPU":[154],"achieves":[155],"up":[156,167],"$1812":[158],"\\times$":[159,162,170,173],"$880.79":[161],"higher":[163,174],"normalized":[164],"throughput":[165],"$12908":[169],"$24.27":[172],"energy":[175],"efficiency":[176],"over":[177],"Intel":[178],"Xeon":[179],"Gold":[180],"6348":[181],"CPU":[182],"NVIDIA":[184],"A100":[185],"GPU,":[186],"respectively.":[187]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
