{"id":"https://openalex.org/W4417403179","doi":"https://doi.org/10.1109/pact65351.2025.00034","title":"Fine-Grained Fusion: The Missing Piece in Area-Efficient State Space Model Acceleration","display_name":"Fine-Grained Fusion: The Missing Piece in Area-Efficient State Space Model Acceleration","publication_year":2025,"publication_date":"2025-11-03","ids":{"openalex":"https://openalex.org/W4417403179","doi":"https://doi.org/10.1109/pact65351.2025.00034"},"language":null,"primary_location":{"id":"doi:10.1109/pact65351.2025.00034","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00034","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065013150","display_name":"Robin Geens","orcid":null},"institutions":[{"id":"https://openalex.org/I99464096","display_name":"KU Leuven","ror":"https://ror.org/05f950310","country_code":"BE","type":"education","lineage":["https://openalex.org/I99464096"]}],"countries":["BE"],"is_corresponding":true,"raw_author_name":"Robin Geens","raw_affiliation_strings":["MICAS, KU Leuven,Leuven,Belgium"],"affiliations":[{"raw_affiliation_string":"MICAS, KU Leuven,Leuven,Belgium","institution_ids":["https://openalex.org/I99464096"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046786058","display_name":"Arne Symons","orcid":"https://orcid.org/0000-0002-4595-9384"},"institutions":[{"id":"https://openalex.org/I99464096","display_name":"KU Leuven","ror":"https://ror.org/05f950310","country_code":"BE","type":"education","lineage":["https://openalex.org/I99464096"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Arne Symons","raw_affiliation_strings":["MICAS, KU Leuven,Leuven,Belgium"],"affiliations":[{"raw_affiliation_string":"MICAS, KU Leuven,Leuven,Belgium","institution_ids":["https://openalex.org/I99464096"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012150553","display_name":"Marian Verhelst","orcid":"https://orcid.org/0000-0003-3495-9263"},"institutions":[{"id":"https://openalex.org/I99464096","display_name":"KU Leuven","ror":"https://ror.org/05f950310","country_code":"BE","type":"education","lineage":["https://openalex.org/I99464096"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Marian Verhelst","raw_affiliation_strings":["MICAS, KU Leuven,Leuven,Belgium"],"affiliations":[{"raw_affiliation_string":"MICAS, KU Leuven,Leuven,Belgium","institution_ids":["https://openalex.org/I99464096"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5065013150"],"corresponding_institution_ids":["https://openalex.org/I99464096"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.433354,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"281","last_page":"291"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.5985000133514404,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.5985000133514404,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.13850000500679016,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.055399999022483826,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.8465999960899353},{"id":"https://openalex.org/keywords/hardware-acceleration","display_name":"Hardware acceleration","score":0.5620999932289124},{"id":"https://openalex.org/keywords/design-space-exploration","display_name":"Design space exploration","score":0.5437999963760376},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5292999744415283},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.4226999878883362},{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.4221000075340271},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.4009000062942505},{"id":"https://openalex.org/keywords/state-space","display_name":"State space","score":0.3898000121116638}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.8465999960899353},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6541000008583069},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.5620999932289124},{"id":"https://openalex.org/C2776221188","wikidata":"https://www.wikidata.org/wiki/Q21072556","display_name":"Design space exploration","level":2,"score":0.5437999963760376},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5292999744415283},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.4226999878883362},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C72434380","wikidata":"https://www.wikidata.org/wiki/Q230930","display_name":"State space","level":2,"score":0.3898000121116638},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3723999857902527},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.3560999929904938},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C134765980","wikidata":"https://www.wikidata.org/wiki/Q879126","display_name":"Bitwise operation","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.32109999656677246},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.31310001015663147},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3005000054836273},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2922999858856201},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.27230000495910645},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.2637999951839447},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C68028875","wikidata":"https://www.wikidata.org/wiki/Q7572595","display_name":"Space mapping","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C104060986","wikidata":"https://www.wikidata.org/wiki/Q180046","display_name":"Space exploration","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/pact65351.2025.00034","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00034","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},{"id":"pmh:oai:lirias2repo.kuleuven.be:20.500.12942/781869","is_oa":false,"landing_page_url":"https://lirias.kuleuven.be/handle/20.500.12942/781869","pdf_url":null,"source":{"id":"https://openalex.org/S4306401954","display_name":"Lirias (KU Leuven)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I99464096","host_organization_name":"KU Leuven","host_organization_lineage":["https://openalex.org/I99464096"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"International Conference on Parallel Architectures and Compilation Techniques (PACT), Irvine, CA, USA, 3-6 November 2025","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334678","display_name":"European Research Council","ror":"https://ror.org/0472cxd90"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"State":[0,192],"Space":[1,193],"Models":[2],"(SSMs)":[3],"offer":[4],"a":[5,28,36,41,107,143,169],"promising":[6],"alternative":[7],"to":[8,32,111],"transformers":[9],"for":[10,172],"long-sequence":[11],"processing.":[12],"However,":[13],"their":[14],"efficiency":[15],"remains":[16],"hindered":[17],"by":[18,127],"memory-bound":[19],"operations,":[20],"particularly":[21],"in":[22,190],"the":[23,50,66,74,86,94,154,159],"prefill":[24],"stage.":[25],"While":[26],"MARCA,":[27],"recent":[29],"first":[30],"effort":[31],"accelerate":[33],"SSMs":[34],"through":[35,69,77],"dedicated":[37],"hardware":[38,75,145],"accelerator,":[39,157],"achieves":[40],"great":[42],"speedup":[43,108],"over":[44,114],"high-end":[45],"GPUs,":[46],"an":[47,82,128],"analysis":[48],"of":[49,85,109,130,198],"broader":[51],"accelerator":[52,138],"design":[53,78,139],"space":[54,79],"is":[55],"lacking.":[56],"This":[57],"work":[58],"systematically":[59],"analyzes":[60],"SSM":[61,174],"acceleration":[62],"opportunities":[63],"from":[64,99],"both":[65],"scheduling":[67,104],"perspective,":[68,76],"fine-grained":[70],"operator":[71,166],"fusion,":[72],"and":[73,103,181],"exploration,":[80],"using":[81],"extended":[83],"version":[84],"Stream":[87],"modeling":[88],"framework.":[89],"Our":[90],"results":[91,164],"demonstrate":[92],"that":[93,142],"improved":[95],"data":[96],"locality":[97],"stemming":[98],"our":[100,118],"optimized":[101],"fusion":[102,121,167],"strategy":[105],"enables":[106],"up":[110],"$4.8":[112],"\\times$":[113,150],"unfused":[115],"execution,":[116],"while":[117],"adaptive":[119],"memory-aware":[120],"approach":[122],"reduces":[123],"on-chip":[124],"memory":[125],"requirements":[126],"order":[129],"magnitude":[131],"without":[132],"sacrificing":[133],"performance.":[134],"We":[135],"further":[136],"explore":[137],"trade-offs,":[140],"showing":[141],"fusion-aware":[144],"architecture":[146],"can":[147],"achieve":[148],"$1.78":[149],"higher":[151],"performance":[152],"than":[153],"state-of-the-art":[155],"MARCA":[156],"within":[158],"same":[160],"area":[161],"budget.":[162],"These":[163],"establish":[165],"as":[168],"key":[170],"enabler":[171],"next-generation":[173],"accelerators.ACM":[175],"Reference":[176],"Format:Robin":[177],"Geens,":[178],"Arne":[179],"Symons,":[180],"Marian":[182],"Verhelst.":[183],"2025.":[184],"Fine-Grained":[185],"Fusion:":[186],"The":[187],"Missing":[188],"Piece":[189],"Area-Efficient":[191],"Model":[194],"Acceleration.":[195],"In":[196],"Proceedings":[197],"(PACT":[199],"\u201925).":[200],"ACM,":[201],"New":[202],"York,":[203],"NY,":[204],"USA,":[205],"11":[206],"pages.":[207],"https:":[208],"//doi.org/XXXXXXX.XXXXXXX":[209]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-12-16T00:00:00"}
