{"id":"https://openalex.org/W7158571394","doi":"https://doi.org/10.48550/arxiv.2604.26837","title":"Unifying Sparse Attention with Hierarchical Memory for Scalable Long-Context LLM Serving","display_name":"Unifying Sparse Attention with Hierarchical Memory for Scalable Long-Context LLM Serving","publication_year":2026,"publication_date":"2026-04-29","ids":{"openalex":"https://openalex.org/W7158571394","doi":"https://doi.org/10.48550/arxiv.2604.26837"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.26837","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26837","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.26837","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134883214","display_name":"Zihan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zihan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038360007","display_name":"Baotong Lu","orcid":"https://orcid.org/0000-0002-0230-1048"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Baotong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134917259","display_name":"Shengjie Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Shengjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134882904","display_name":"Yizou Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yizou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134903336","display_name":"Jing Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134897803","display_name":"Yanqi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yanqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090438796","display_name":"Ziming Miao","orcid":"https://orcid.org/0000-0001-7466-2128"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miao, Ziming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102930563","display_name":"Ming-Chang Yang","orcid":"https://orcid.org/0000-0002-4029-757X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ming-Chang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134893028","display_name":"Haiying Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Haiying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134906765","display_name":"Qi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134893172","display_name":"Fan Yang","orcid":"https://orcid.org/0000-0001-9390-2270"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Fan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6438999772071838,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6438999772071838,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.16380000114440918,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.02979999966919422,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7064999938011169},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.5329999923706055},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4860999882221222},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4645000100135803},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.46219998598098755},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.4357999861240387},{"id":"https://openalex.org/keywords/partition","display_name":"Partition (number theory)","score":0.43529999256134033},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.4311000108718872},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.42649999260902405}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8158000111579895},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7064999938011169},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.5329999923706055},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4860999882221222},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4648999869823456},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4645000100135803},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.46219998598098755},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.4357999861240387},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.43529999256134033},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.4311000108718872},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.42649999260902405},{"id":"https://openalex.org/C133320665","wikidata":"https://www.wikidata.org/wiki/Q179299","display_name":"RAID","level":2,"score":0.4124999940395355},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.4124000072479248},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.39959999918937683},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.38119998574256897},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.34549999237060547},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3125},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.3111000061035156},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.30219998955726624},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.29789999127388},{"id":"https://openalex.org/C101722063","wikidata":"https://www.wikidata.org/wiki/Q218825","display_name":"Random access","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.2921999990940094},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C136968285","wikidata":"https://www.wikidata.org/wiki/Q3246374","display_name":"Set partitioning in hierarchical trees","level":5,"score":0.26739999651908875},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.25690001249313354},{"id":"https://openalex.org/C134765980","wikidata":"https://www.wikidata.org/wiki/Q879126","display_name":"Bitwise operation","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.26837","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26837","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.26837","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26837","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Long-context":[0],"LLM":[1],"serving":[2],"is":[3],"bottlenecked":[4],"by":[5,19,198],"the":[6,27,35,70,87,93,106,164,170,203],"cost":[7],"of":[8,26,95],"attending":[9],"over":[10,202],"ever-growing":[11],"KV":[12,28,36,74,84,111,130,135],"caches.":[13],"Dynamic":[14],"sparse":[15,54,180],"attention":[16,181],"promises":[17],"relief":[18],"accessing":[20],"only":[21],"a":[22,77,100,117,127,133,146,157],"small,":[23],"query-dependent":[24],"subset":[25],"state":[29],"per":[30],"decoding":[31],"step":[32],"and":[33,61,144,155,189,195],"extending":[34],"storage":[37,75,112],"to":[38,151,163,200],"CPU":[39],"memory.":[40],"In":[41],"practice,":[42],"however,":[43],"these":[44],"algorithmic":[45],"savings":[46],"rarely":[47],"translate":[48],"into":[49],"end-to-end":[50,187],"system-level":[51],"gains":[52],"because":[53],"methods":[55],"typically":[56],"operate":[57],"at":[58],"different":[59,123],"granularities":[60,125],"thus":[62],"rely":[63],"on":[64,175],"ad":[65],"hoc,":[66],"per-algorithm":[67],"implementations.":[68,206],"At":[69],"same":[71],"time,":[72],"hierarchical":[73,110,159],"introduces":[76],"new":[78],"systems":[79],"bottleneck:":[80],"retrieving":[81],"fine-grained,":[82],"irregular":[83],"subsets":[85],"across":[86],"GPU-CPU":[88],"boundary":[89],"can":[90],"easily":[91],"erase":[92],"benefits":[94],"sparsity.":[96],"We":[97],"present":[98],"SPIN,":[99],"sparse-attention-aware":[101],"inference":[102],"framework":[103],"that":[104,121,138],"co-designs":[105],"execution":[107],"pipeline":[108],"with":[109,177],"through":[113],"three":[114,178],"techniques:":[115],"(1)":[116],"unified":[118],"partition":[119],"abstraction":[120],"maps":[122],"sparsity":[124],"onto":[126],"shared":[128],"page-based":[129],"substrate;":[131],"(2)":[132],"locality-aware":[134],"cache":[136],"manager":[137],"dynamically":[139],"sizes":[140],"per-request":[141],"HBM":[142],"budgets":[143],"uses":[145],"GPU-friendly":[147],"bucketed":[148],"LRU":[149],"policy":[150],"cut":[152],"PCIe":[153],"round-trips;":[154],"(3)":[156],"two-level":[158],"metadata":[160],"layout":[161],"sized":[162],"active":[165],"working":[166],"set":[167],"rather":[168],"than":[169,193],"worst-case":[171],"address":[172],"space.":[173],"Built":[174],"vLLM":[176],"representative":[179],"algorithms,":[182],"SPIN":[183],"delivers":[184],"1.66-5.66x":[185],"higher":[186],"throughput":[188],"7-9x":[190],"lower":[191],"TTFT":[192],"vLLM,":[194],"reduces":[196],"TPOT":[197],"up":[199],"58%":[201],"original":[204],"sparse-attention":[205]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-01T00:00:00"}
