{"id":"https://openalex.org/W4409248688","doi":"https://doi.org/10.1109/hpca61900.2025.00113","title":"InstAttention: In-Storage Attention Offloading for Cost-Effective Long-Context LLM Inference","display_name":"InstAttention: In-Storage Attention Offloading for Cost-Effective Long-Context LLM Inference","publication_year":2025,"publication_date":"2025-03-01","ids":{"openalex":"https://openalex.org/W4409248688","doi":"https://doi.org/10.1109/hpca61900.2025.00113"},"language":"en","primary_location":{"id":"doi:10.1109/hpca61900.2025.00113","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00113","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011711693","display_name":"Xiurui Pan","orcid":"https://orcid.org/0000-0002-2528-2660"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiurui Pan","raw_affiliation_strings":["Peking University"],"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019668670","display_name":"Erzhong Li","orcid":"https://orcid.org/0009-0004-4130-0912"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Endian Li","raw_affiliation_strings":["Peking University"],"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112014055","display_name":"Qiao Li","orcid":"https://orcid.org/0000-0002-3766-0681"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiao Li","raw_affiliation_strings":["University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018381533","display_name":"Shengwen Liang","orcid":"https://orcid.org/0000-0001-8407-2594"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengwen Liang","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111117324","display_name":"Yizhou Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yizhou Shan","raw_affiliation_strings":["Huawei Cloud"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015061573","display_name":"Ke Zhou","orcid":"https://orcid.org/0000-0002-2161-8796"},"institutions":[{"id":"https://openalex.org/I4210138186","display_name":"Wuhan National Laboratory for Optoelectronics","ror":"https://ror.org/03c9ncn37","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210138186"]},{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ke Zhou","raw_affiliation_strings":["Wuhan National Laboratory for Optoelectronics of Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Wuhan National Laboratory for Optoelectronics of Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I4210138186","https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007062927","display_name":"Yingwei Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yingwei Luo","raw_affiliation_strings":["Peking University"],"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100395178","display_name":"Xiaolin Wang","orcid":"https://orcid.org/0000-0003-4293-7523"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaolin Wang","raw_affiliation_strings":["Peking University"],"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101711826","display_name":"Jie Zhang","orcid":"https://orcid.org/0000-0001-9803-7140"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["Peking University"],"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5011711693"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":9.6991,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.98008141,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1510","last_page":"1525"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.978600025177002,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.978600025177002,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9732000231742859,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9643999934196472,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.757735013961792},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6821058988571167},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6233479976654053},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3283613920211792},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.32163205742836},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2786390781402588}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.757735013961792},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6821058988571167},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6233479976654053},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3283613920211792},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.32163205742836},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2786390781402588},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca61900.2025.00113","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00113","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":63,"referenced_works":["https://openalex.org/W1608061514","https://openalex.org/W2006425633","https://openalex.org/W2010390358","https://openalex.org/W2097490816","https://openalex.org/W2149992413","https://openalex.org/W2219142176","https://openalex.org/W2607652657","https://openalex.org/W2820897267","https://openalex.org/W2963339397","https://openalex.org/W3041587974","https://openalex.org/W3043188845","https://openalex.org/W3109747395","https://openalex.org/W3155143326","https://openalex.org/W3178293493","https://openalex.org/W3208624098","https://openalex.org/W4214559301","https://openalex.org/W4220660812","https://openalex.org/W4280496502","https://openalex.org/W4281689324","https://openalex.org/W4281850905","https://openalex.org/W4321636575","https://openalex.org/W4383749405","https://openalex.org/W4385245566","https://openalex.org/W4385572068","https://openalex.org/W4385834084","https://openalex.org/W4387321091","https://openalex.org/W4392240262","https://openalex.org/W4392427708","https://openalex.org/W4393407046","https://openalex.org/W4394999019","https://openalex.org/W4399197955","https://openalex.org/W4400985603","https://openalex.org/W4401211592","https://openalex.org/W4402671659","https://openalex.org/W6684515133","https://openalex.org/W6765119170","https://openalex.org/W6773730542","https://openalex.org/W6804239712","https://openalex.org/W6810874553","https://openalex.org/W6811340617","https://openalex.org/W6850625674","https://openalex.org/W6850927664","https://openalex.org/W6852408377","https://openalex.org/W6852999659","https://openalex.org/W6853192989","https://openalex.org/W6857551316","https://openalex.org/W6857690716","https://openalex.org/W6859180420","https://openalex.org/W6859495724","https://openalex.org/W6860155063","https://openalex.org/W6860165519","https://openalex.org/W6860337933","https://openalex.org/W6861013483","https://openalex.org/W6861206314","https://openalex.org/W6861630750","https://openalex.org/W6861652536","https://openalex.org/W6861839547","https://openalex.org/W6862223741","https://openalex.org/W6862397249","https://openalex.org/W6862520094","https://openalex.org/W6862776294","https://openalex.org/W6868642155","https://openalex.org/W6869966162"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"The":[0,159],"widespread":[1],"of":[2,30,149,152],"Large":[3],"Language":[4],"Models":[5],"(LLMs)":[6],"marks":[7],"a":[8,37,96,132,177],"significant":[9,75],"milestone":[10],"in":[11,22,109],"generative":[12],"AI.":[13],"Nevertheless,":[14,71],"the":[15,27,31,41,69,103,125,145,156],"increasing":[16],"context":[17],"length":[18],"and":[19,67,112,165],"batch":[20],"size":[21],"offline":[23,64],"LLM":[24,98],"inference":[25,65,99,190],"escalate":[26],"memory":[28,56],"requirement":[29],"key-value":[32],"(KV)":[33],"cache,":[34],"which":[35,123],"imposes":[36],"huge":[38],"burden":[39],"on":[40],"GPU":[42,164],"VRAM,":[43],"especially":[44],"for":[45,63,176,188],"resource-constrained":[46],"scenarios":[47,66],"(e.g.,":[48],"edge":[49],"computing).":[50],"Several":[51],"cost-effective":[52],"solutions":[53,200],"leverage":[54],"host":[55],"or":[57],"SSDs":[58],"to":[59,85,118,143,193,197],"reduce":[60],"storage":[61],"costs":[62],"improve":[68],"throughput.":[70],"they":[72],"suffer":[73],"from":[74],"performance":[76],"penalties":[77],"imposed":[78],"by":[79,155,191],"intensive":[80],"KV":[81,115,127,139],"cache":[82,140],"accesses":[83],"due":[84],"limited":[86,154],"PCIe":[87,157],"bandwidth.":[88,158],"To":[89],"address":[90],"these":[91],"issues,":[92],"we":[93],"propose":[94],"InstAttention,":[95],"novel":[97],"system":[100],"that":[101,175],"offloads":[102],"most":[104],"performance-critical":[105],"computation":[106],"(i.e.,":[107,114],"attention":[108,136],"decoding":[110],"phase)":[111],"data":[113,169],"cache)":[116],"parts":[117],"Computational":[119],"Storage":[120],"Drives":[121],"(CSDs),":[122],"minimize":[124],"enormous":[126],"transfer":[128],"overheads.":[129,171],"InstAttention":[130,185],"designs":[131],"dedicated":[133],"flashaware":[134],"in-storage":[135],"engine":[137],"with":[138],"management":[141],"mechanisms":[142],"exploit":[144],"high":[146],"internal":[147],"bandwidths":[148],"CSDs":[150,166],"instead":[151],"being":[153],"optimized":[160],"P2P":[161],"transmission":[162],"between":[163],"further":[167],"reduces":[168],"migration":[170],"Experimental":[172],"results":[173],"demonstrate":[174],"13B":[178],"model":[179],"using":[180],"an":[181],"NVIDIA":[182],"A6000":[183],"GPU,":[184],"improves":[186],"throughput":[187],"long-sequence":[189],"up":[192],"$11.1":[194],"\\times$,":[195],"compared":[196],"existing":[198],"SSD-based":[199],"such":[201],"as":[202],"FlexGen.":[203]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":4}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
