{"id":"https://openalex.org/W7164209693","doi":"https://doi.org/10.1109/fccm68464.2026.00050","title":"STEEL: Sparsity-Aware Fused Attention for Energy-Efficient Long-Sequence Inference on AMD\u2019s XDNA\u2122 NPU","display_name":"STEEL: Sparsity-Aware Fused Attention for Energy-Efficient Long-Sequence Inference on AMD\u2019s XDNA\u2122 NPU","publication_year":2026,"publication_date":"2026-05-13","ids":{"openalex":"https://openalex.org/W7164209693","doi":"https://doi.org/10.1109/fccm68464.2026.00050"},"language":null,"primary_location":{"id":"doi:10.1109/fccm68464.2026.00050","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fccm68464.2026.00050","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE 34th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128771210","display_name":"Victor J. B. Jung","orcid":null},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Victor J.B. Jung","raw_affiliation_strings":["ETH Z&#x00FC;rich,Integrated Systems Laboratory (IIS),Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Z&#x00FC;rich,Integrated Systems Laboratory (IIS),Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138315820","display_name":"Gagandeep Singh","orcid":null},"institutions":[{"id":"https://openalex.org/I4210121557","display_name":"Advanced Media Research (United States)","ror":"https://ror.org/02kgjbk95","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121557"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gagandeep Singh","raw_affiliation_strings":["AMD Research and Advanced Development (RAD)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AMD Research and Advanced Development (RAD)","institution_ids":["https://openalex.org/I4210121557"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006148629","display_name":"Joseph Melber","orcid":"https://orcid.org/0000-0001-9519-0502"},"institutions":[{"id":"https://openalex.org/I4210121557","display_name":"Advanced Media Research (United States)","ror":"https://ror.org/02kgjbk95","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121557"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joseph Melber","raw_affiliation_strings":["AMD Research and Advanced Development (RAD)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AMD Research and Advanced Development (RAD)","institution_ids":["https://openalex.org/I4210121557"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035560671","display_name":"Kristof Denolf","orcid":"https://orcid.org/0000-0001-6668-4562"},"institutions":[{"id":"https://openalex.org/I4210121557","display_name":"Advanced Media Research (United States)","ror":"https://ror.org/02kgjbk95","country_code":"US","type":"company","lineage":["https://openalex.org/I4210121557"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kristof Denolf","raw_affiliation_strings":["AMD Research and Advanced Development (RAD)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AMD Research and Advanced Development (RAD)","institution_ids":["https://openalex.org/I4210121557"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089543048","display_name":"Francesco Conti","orcid":null},"institutions":[{"id":"https://openalex.org/I9360294","display_name":"University of Bologna","ror":"https://ror.org/01111rn36","country_code":"IT","type":"education","lineage":["https://openalex.org/I9360294"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Francesco Conti","raw_affiliation_strings":["University of Bologna,Department of Electrical, Electronic and Information Engineering (DEI),Italy"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Bologna,Department of Electrical, Electronic and Information Engineering (DEI),Italy","institution_ids":["https://openalex.org/I9360294"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043408422","display_name":"Luca Benini","orcid":"https://orcid.org/0000-0001-8068-3806"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Luca Benini","raw_affiliation_strings":["ETH Z&#x00FC;rich,Integrated Systems Laboratory (IIS),Switzerland"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ETH Z&#x00FC;rich,Integrated Systems Laboratory (IIS),Switzerland","institution_ids":["https://openalex.org/I35440088"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.89080033,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"262","last_page":"262"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.16979999840259552,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10015","display_name":"Genomics and Phylogenetic Studies","score":0.16979999840259552,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.09929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.07339999824762344,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.40939998626708984},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.2946000099182129},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.29100000858306885},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.2515000104904175},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.24320000410079956}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5388000011444092},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5374000072479248},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.29100000858306885},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.28839999437332153},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2662999927997589},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.2515000104904175},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.24320000410079956},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.2401999980211258}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/fccm68464.2026.00050","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fccm68464.2026.00050","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE 34th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8111394643783569,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W4386859272"],"related_works":[],"abstract_inverted_index":{"THE":[0],"growing":[1],"integration":[2],"of":[3],"Transformer-based":[4],"artificial":[5],"intelligence":[6],"(AI)":[7],"agents":[8],"into":[9],"core":[10],"operating":[11],"system":[12],"functions":[13],"is":[14,43,63],"a":[15,44,64],"key":[16],"driver":[17],"in":[18],"modern":[19],"laptop":[20],"systems-on-chip":[21],"(SoCs)":[22],"design.":[23],"While":[24,132],"enabling":[25],"powerful":[26],"capabilities,":[27],"their":[28],"inference":[29,68],"incurs":[30],"significant":[31,77],"compute":[32],"and":[33,54,70,87,99,118,129],"data-movement":[34,120],"overhead,":[35],"making":[36],"them":[37],"highly":[38],"energy-intensive.":[39],"This":[40],"energy":[41,71,112],"cost":[42],"fundamental":[45],"bottleneck":[46],"for":[47,141,152],"embedded":[48],"mobile":[49],"platforms":[50,89],"with":[51],"tight":[52],"power":[53],"thermal":[55],"constraints":[56],"[2]":[57],".":[58],"The":[59],"Attention":[60],"prefill":[61],"stage":[62],"major":[65],"contributor":[66],"to":[67],"latency":[69],"at":[72],"long":[73],"sequence":[74],"lengths.":[75],"Consequently,":[76],"effort":[78],"has":[79,136],"focused":[80,137],"on":[81,138],"optimizing":[82,139],"attention":[83,140,151],"across":[84],"commercial":[85],"[3]":[86,98],"academic":[88],"[4]":[90],",":[91],"spanning":[92],"algorithmic":[93],"advances":[94],"such":[95],"as":[96],"FlashAttention":[97],"hardware":[100],"enhancements":[101],"including":[102],"specialized":[103],"non-linear":[104],"units.":[105],"Neural":[106],"processing":[107,143],"units":[108,144],"(NPUs)":[109],"achieve":[110],"high":[111],"efficiency":[113],"through":[114],"spatial":[115],"dataflow":[116],"architectures":[117],"explicit":[119],"programming":[121],"models,":[122],"which":[123],"expose":[124],"fine-grained":[125],"control":[126],"over":[127],"computation":[128],"memory":[130],"transfers.":[131],"extensive":[133],"prior":[134],"work":[135],"graphics":[142],"(GPUs),":[145],"comparatively":[146],"few":[147],"efforts":[148],"have":[149],"targeted":[150],"NPUs.":[153]},"counts_by_year":[],"updated_date":"2026-06-12T06:20:11.936012","created_date":"2026-06-11T00:00:00"}
