{"id":"https://openalex.org/W7133522572","doi":"https://doi.org/10.1109/hpca68181.2026.11408481","title":"BitDecoding: Unlocking Tensor Cores for Long-Context LLMs with Low-Bit KV Cache","display_name":"BitDecoding: Unlocking Tensor Cores for Long-Context LLMs with Low-Bit KV Cache","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7133522572","doi":"https://doi.org/10.1109/hpca68181.2026.11408481"},"language":null,"primary_location":{"id":"doi:10.1109/hpca68181.2026.11408481","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408481","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036368309","display_name":"Dayou Du","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Dayou Du","raw_affiliation_strings":["University of Edinburgh"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101401413","display_name":"Shijie Cao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Shijie Cao","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060302344","display_name":"Jianyi Cheng","orcid":"https://orcid.org/0000-0003-2791-2555"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jianyi Cheng","raw_affiliation_strings":["University of Edinburgh"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034207820","display_name":"Luo Mai","orcid":"https://orcid.org/0000-0002-3594-1092"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Luo Mai","raw_affiliation_strings":["University of Edinburgh"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123935909","display_name":"Ting Cao","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ting Cao","raw_affiliation_strings":["Institute for AI Industry Research (AIR), Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Institute for AI Industry Research (AIR), Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5128074352","display_name":"Mao Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mao Yang","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https://openalex.org/I4210164937"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5036368309"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.9259724,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"13"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.349700003862381,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.349700003862381,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2808000147342682,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.11190000176429749,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.4081000089645386},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.3930000066757202},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.26809999346733093},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.25209999084472656},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.24979999661445618}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.44589999318122864},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.42419999837875366},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.4081000089645386},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.3930000066757202},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.28679999709129333},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C62649853","wikidata":"https://www.wikidata.org/wiki/Q199687","display_name":"Remote sensing","level":1,"score":0.25119999051094055},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.24979999661445618},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.22910000383853912}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca68181.2026.11408481","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408481","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5095616579055786,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W2277429057","https://openalex.org/W2954698171","https://openalex.org/W4385573671","https://openalex.org/W4387321091","https://openalex.org/W4389518760","https://openalex.org/W4400409880","https://openalex.org/W4408029577","https://openalex.org/W4409248690","https://openalex.org/W4415796210","https://openalex.org/W4415797154","https://openalex.org/W4415799038"],"related_works":[],"abstract_inverted_index":{"The":[0],"rise":[1],"of":[2,60],"long-context":[3,70,246],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"amplifies":[8],"memory":[9,35],"and":[10,90,134,145,156,172,192,206,248],"bandwidth":[11],"demands":[12],"during":[13],"autoregressive":[14],"decoding,":[15],"as":[16],"the":[17,183,220],"Key-Value":[18],"(KV)":[19],"cache":[20,83],"grows":[21],"with":[22,74,106,142,149,212,231],"each":[23],"generated":[24],"token.":[25],"Low-bit":[26],"KV-cache":[27],"quantization":[28,128,140],"(e.g.,":[29],"4-bit":[30],"or":[31],"2-bit)":[32],"can":[33],"reduce":[34],"footprint":[36],"while":[37,218],"preserving":[38],"accuracy,":[39],"but":[40],"existing":[41],"systems":[42],"suffer":[43],"from":[44],"slow":[45],"decoding":[46,84,180,201,238],"due":[47],"to":[48,101,130,153,178,210,227],"their":[49],"exclusive":[50],"reliance":[51],"on":[52,62,182,188,197,216],"CUDA":[53,88,155],"cores,":[54],"neglecting":[55],"Tensor":[56,91,103,157],"Cores\u2014the":[57],"primary":[58],"source":[59],"compute":[61],"modern":[63],"GPUs.":[64],"We":[65],"present":[66],"BitDecoding,":[67],"a":[68,119,127,146,150,199,232],"new":[69],"LLMs":[71],"inference":[72],"system":[73,115,223],"low-bit":[75,81,222],"KV":[76,82],"cache.":[77],"BitDecoding":[78,117,195,235],"enables":[79],"efficient":[80],"by":[85,225,240],"cooperatively":[86],"leveraging":[87],"Cores":[89,158],"Cores.":[92],"It":[93],"introduces":[94],"methods":[95],"for":[96,111,160,245],"automatically":[97],"inducing":[98],"optimized":[99],"layouts":[100],"exploit":[102],"Cores,":[104],"along":[105],"novel":[107],"warp-level":[108],"parallelization":[109],"strategies":[110],"dequantization.":[112],"For":[113],"unified":[114],"support,":[116],"includes":[118],"query":[120],"transformation":[121],"module":[122],"supporting":[123],"diverse":[124],"attention":[125],"variants,":[126],"kernel":[129,148],"support":[131],"both":[132],"tensor-wise":[133],"channelwise":[135],"scaling":[136],"used":[137],"in":[138],"various":[139],"algorithms":[141],"high":[143],"performance,":[144],"dequantization":[147],"softwaredefined":[151],"pipeline":[152],"coordinate":[154],"execution":[159],"mix-precision":[161],"operations.":[162],"In":[163],"addition,":[164],"architecture-specific":[165],"optimizations":[166],"leverage":[167],"Hopper's":[168],"warpgroup":[169],"tensor":[170,176],"instructions":[171],"Blackwell's":[173],"native":[174,213],"low-precision":[175],"formats":[177,215],"maximize":[179],"throughput":[181],"latest":[184],"GPU":[185],"generations.":[186],"Evaluated":[187],"Blackwell,":[189,217],"Hopper,":[190],"Ada,":[191],"Ampere":[193],"architectures,":[194],"attains":[196],"average":[198],"7.5\u00d7":[200],"speedup":[202],"over":[203],"FP16":[204],"FlashDecoding-v2,":[205],"further":[207],"reaches":[208],"up":[209,226],"8.6\u00d7":[211],"MXFP4":[214],"surpassing":[219],"state-of-the-art":[221],"QServe":[224],"4.3\u00d7.":[228],"On":[229],"LLaMA-3.1-8B":[230],"128K":[233],"context,":[234],"reduces":[236],"singlebatch":[237],"latency":[239],"3\u00d7,":[241],"demonstrating":[242],"substantial":[243],"improvements":[244],"generation,":[247],"is":[249],"open":[250],"sourced":[251],"at":[252],"https://github.com/OpenBitSys/BitDecoding.":[253]},"counts_by_year":[],"updated_date":"2026-03-06T06:45:51.903784","created_date":"2026-03-05T00:00:00"}
