{"id":"https://openalex.org/W7117158700","doi":"https://doi.org/10.1109/tc.2025.3648055","title":"Designing Spatial Architectures for Sparse Attention: STAR Accelerator via Cross-Stage Tiling","display_name":"Designing Spatial Architectures for Sparse Attention: STAR Accelerator via Cross-Stage Tiling","publication_year":2025,"publication_date":"2025-12-24","ids":{"openalex":"https://openalex.org/W7117158700","doi":"https://doi.org/10.1109/tc.2025.3648055"},"language":null,"primary_location":{"id":"doi:10.1109/tc.2025.3648055","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tc.2025.3648055","pdf_url":null,"source":{"id":"https://openalex.org/S157670870","display_name":"IEEE Transactions on Computers","issn_l":"0018-9340","issn":["0018-9340","1557-9956","2326-3814"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089405414","display_name":"Huizheng Wang","orcid":"https://orcid.org/0000-0002-9763-8208"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huizheng Wang","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9763-8208","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Taiquan Wei","orcid":"https://orcid.org/0009-0005-3501-3148"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Taiquan Wei","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-3501-3148","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121176129","display_name":"Hongbin Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongbin Wang","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0003-2383-8084","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121146620","display_name":"Zichuan Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zichuan Wang","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-7114-0647","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103213494","display_name":"Xinru Tang","orcid":"https://orcid.org/0009-0004-6038-3709"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinru Tang","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-6038-3709","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009692777","display_name":"Zhiheng Yue","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiheng Yue","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-4084-3478","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121127715","display_name":"Shaojun Wei","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaojun Wei","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5117-7920","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yang Hu","orcid":"https://orcid.org/0000-0001-6942-4395"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Hu","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-6942-4395","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":null,"display_name":"Shouyi Yin","orcid":"https://orcid.org/0000-0003-2309-572X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shouyi Yin","raw_affiliation_strings":["School of Integrated Circuits, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-2309-572X","affiliations":[{"raw_affiliation_string":"School of Integrated Circuits, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.59868154,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"75","issue":"3","first_page":"1125","last_page":"1140"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.49799999594688416,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.49799999594688416,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.14429999887943268,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.06620000302791595,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.766700029373169},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.7335000038146973},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5396000146865845},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.5306000113487244},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.4575999975204468},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.39739999175071716},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.3824000060558319},{"id":"https://openalex.org/keywords/energy-consumption","display_name":"Energy consumption","score":0.36399999260902405},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.3409000039100647}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8673999905586243},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.766700029373169},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.7335000038146973},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5471000075340271},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5396000146865845},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.5306000113487244},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.4575999975204468},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.39739999175071716},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3824000060558319},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.36399999260902405},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.33719998598098755},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3359000086784363},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.33489999175071716},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.3208000063896179},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C22684755","wikidata":"https://www.wikidata.org/wiki/Q847526","display_name":"Queueing theory","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C111696304","wikidata":"https://www.wikidata.org/wiki/Q2303697","display_name":"Sorting","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.272599995136261},{"id":"https://openalex.org/C134652429","wikidata":"https://www.wikidata.org/wiki/Q1052698","display_name":"Jitter","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2669999897480011},{"id":"https://openalex.org/C2780897414","wikidata":"https://www.wikidata.org/wiki/Q7600592","display_name":"Star (game theory)","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C37404715","wikidata":"https://www.wikidata.org/wiki/Q380679","display_name":"Dynamic programming","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tc.2025.3648055","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tc.2025.3648055","pdf_url":null,"source":{"id":"https://openalex.org/S157670870","display_name":"IEEE Transactions on Computers","issn_l":"0018-9340","issn":["0018-9340","1557-9956","2326-3814"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computers","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.8967840671539307}],"awards":[{"id":"https://openalex.org/G1187591493","display_name":null,"funder_award_id":"2022ZD0115200","funder_id":"https://openalex.org/F4320329860","funder_display_name":"National Science and Technology Major Project"},{"id":"https://openalex.org/G3322336741","display_name":null,"funder_award_id":"U24A20234","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G338081660","display_name":null,"funder_award_id":"62125403","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5227578763","display_name":null,"funder_award_id":"92464302","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8200624584","display_name":null,"funder_award_id":"U24B20164","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320329777","display_name":"Beijing National Research Center For Information Science And Technology","ror":null},{"id":"https://openalex.org/F4320329860","display_name":"National Science and Technology Major Project","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1988113008","https://openalex.org/W2002555321","https://openalex.org/W2034861439","https://openalex.org/W2913573286","https://openalex.org/W2963122961","https://openalex.org/W2980104813","https://openalex.org/W3016904661","https://openalex.org/W3017024317","https://openalex.org/W3047848469","https://openalex.org/W3131500599","https://openalex.org/W3158831985","https://openalex.org/W3159727696","https://openalex.org/W3189877953","https://openalex.org/W3190761184","https://openalex.org/W3205192296","https://openalex.org/W3206453033","https://openalex.org/W4211076402","https://openalex.org/W4214686755","https://openalex.org/W4297097426","https://openalex.org/W4319988867","https://openalex.org/W4360831795","https://openalex.org/W4380881077","https://openalex.org/W4381886086","https://openalex.org/W4392450088","https://openalex.org/W4392567268","https://openalex.org/W4404954358","https://openalex.org/W4413017445","https://openalex.org/W7092191477"],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"rely":[4],"on":[5],"self\u2013attention":[6],"for":[7,62,104,159],"contextual":[8],"understanding,":[9],"demanding":[10],"high-throughput":[11],"inference":[12,64],"and":[13,48,57,87,108,125,131,140,156],"large\u2013scale":[14],"token":[15],"parallelism":[16],"(LTPP).":[17],"Existing":[18],"dynamic":[19],"sparsity":[20,33,71],"accelerators":[21,134],"falter":[22],"under":[23,65],"LTPP":[24],"scenarios":[25],"due":[26],"to":[27,78,122,137,168],"stage-isolated":[28],"optimizations.":[29],"Revisiting":[30],"the":[31,169],"end-to-end":[32],"acceleration":[34],"flow,":[35],"we":[36,146],"identify":[37],"an":[38],"overlooked":[39],"opportunity:":[40],"crossstage":[41],"coordination":[42],"can":[43],"substantially":[44],"reduce":[45],"redundant":[46],"computation":[47],"memory":[49,106],"access.":[50],"We":[51],"propose":[52],"STAR,":[53],"a":[54,69,88,95,115,150,174],"cross-stage":[55],"computetation":[56],"memory\u2013efficient":[58],"algorithm\u2013hardware":[59],"co-design":[60],"tailored":[61],"Transformer":[63],"LTPP.":[66],"STAR":[67,117,148],"introduces":[68],"leading-zero-based":[70],"prediction":[72,80],"using":[73],"log-domain":[74],"add":[75],"only":[76],"operations":[77],"minimize":[79],"overhead.":[81],"It":[82],"further":[83],"employs":[84],"distributed":[85],"sorting":[86],"sorted":[89],"updating":[90],"FlashAttention":[91],"mechanism,":[92],"guided":[93],"by":[94,114,135],"coordinated":[96],"tiling":[97],"strategy":[98],"that":[99],"enables":[100],"fine-grained":[101],"stage":[102],"interaction":[103],"improved":[105],"efficiency":[107,128,143],"latency.":[109],"These":[110],"optimizations":[111],"are":[112],"supported":[113],"dedicated":[116],"accelerator":[118],"architecture,":[119,153],"achieving":[120],"up":[121,136],"9.2\u00d7":[123],"speedup":[124],"71.2\u00d7":[126],"energy":[127,139],"over":[129],"A100,":[130],"surpassing":[132],"SOTA":[133],"16.1\u00d7":[138],"27.1\u00d7":[141],"area":[142],"gains.":[144],"Further,":[145],"deploy":[147],"onto":[149],"multi-core":[151],"spatial":[152],"optimizing":[154],"dataflow":[155],"execution":[157],"orchestration":[158],"ultra-long":[160],"sequence":[161],"processing.":[162],"Architectural":[163],"evaluation":[164],"shows":[165],"that,":[166],"compared":[167],"baseline":[170],"design,":[171],"Spatial-STAR":[172],"achieves":[173],"20.1\u00d7":[175],"throughput":[176],"improvement.":[177]},"counts_by_year":[],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-12-24T00:00:00"}
