{"id":"https://openalex.org/W7161742294","doi":"https://doi.org/10.48550/arxiv.2605.17757","title":"OSCAR: Offline Spectral Covariance-Aware Rotation for 2-bit KV Cache Quantization","display_name":"OSCAR: Offline Spectral Covariance-Aware Rotation for 2-bit KV Cache Quantization","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161742294","doi":"https://doi.org/10.48550/arxiv.2605.17757"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.17757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.17757","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136466531","display_name":"Zhongzhu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhongzhu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051047559","display_name":"Donglin Zhuang","orcid":"https://orcid.org/0000-0003-3355-407X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Donglin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134312322","display_name":"Jisen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jisen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101820620","display_name":"Ziyan Chen","orcid":"https://orcid.org/0000-0001-6277-5635"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ziyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136482519","display_name":"Shuaiwen Leon Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Shuaiwen Leon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013502032","display_name":"Ben Athiwaratkun","orcid":"https://orcid.org/0000-0002-2009-496X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Athiwaratkun, Ben","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136484328","display_name":"Xiaoxia Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xiaoxia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.1793999969959259,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.1793999969959259,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.12030000239610672,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.09950000047683716,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.5131000280380249},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.4528000056743622},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.4415000081062317},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.4357999861240387},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4169999957084656},{"id":"https://openalex.org/keywords/rotation","display_name":"Rotation (mathematics)","score":0.3750999867916107},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.3197999894618988}],"concepts":[{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.6115999817848206},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6101999878883362},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.5131000280380249},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.4528000056743622},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.4415000081062317},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.4357999861240387},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4169999957084656},{"id":"https://openalex.org/C74050887","wikidata":"https://www.wikidata.org/wiki/Q848368","display_name":"Rotation (mathematics)","level":2,"score":0.3750999867916107},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3197999894618988},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30630001425743103},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2930999994277954},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.27219998836517334},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C178650346","wikidata":"https://www.wikidata.org/wiki/Q201984","display_name":"Covariance","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C60292330","wikidata":"https://www.wikidata.org/wiki/Q1014065","display_name":"Hadamard transform","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25940001010894775},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.17757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.17757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.17757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"INT2":[0,31,101,166,207],"KV-cache":[1,109,212],"quantization":[2,47,73],"is":[3],"attractive":[4],"for":[5,65],"long-context":[6],"LLM":[7,120],"serving,":[8],"but":[9,27,90],"it":[10,70,182],"remains":[11,105,183,198],"difficult":[12],"to":[13,58,141,157,168,175,195,221,238,243],"make":[14],"both":[15,201],"accurate":[16],"and":[17,55,62,111,126,149,159,177,232],"deployable.":[18],"Simple":[19],"rotations":[20,61],"such":[21,123],"as":[22,124],"Hadamard":[23],"transforms":[24],"reduce":[25],"outliers,":[26],"still":[28],"degrade":[29],"at":[30,223],"because":[32],"they":[33],"are":[34],"not":[35,85],"aligned":[36],"with":[37,74,98,107,136,187],"downstream":[38],"attention.":[39],"We":[40,128,171],"propose":[41],"OSCAR,":[42],"an":[43],"Ultra-low-bit":[44],"KV":[45,72],"Cache":[46],"method":[48],"that":[49,78,104],"estimates":[50],"attention-aware":[51],"covariance":[52,76],"structures":[53,77],"offline":[54],"uses":[56],"them":[57],"derive":[59],"fixed":[60],"clipping":[63],"thresholds":[64],"quantization.":[66],"In":[67],"this":[68],"way,":[69],"aligns":[71],"the":[75,153,228],"attention":[79,102],"actually":[80],"consumes.":[81],"More":[82],"importantly,":[83],"we":[84],"only":[86],"provide":[87],"theoretical":[88],"justification":[89],"also":[91],"develop":[92],"a":[93,99],"fully":[94],"deployable":[95],"OSCAR":[96,151,174,197,210],"system":[97],"custom":[100],"kernel":[103,113],"compatible":[106],"paged":[108],"serving":[110,121],"fused":[112],"pipelines,":[114],"enabling":[115],"seamless":[116],"integration":[117],"into":[118],"modern":[119],"frameworks":[122],"SGLang":[125],"vLLM.":[127],"evaluate":[129],"our":[130],"methods":[131],"on":[132,185,200],"recent":[133],"reasoning":[134,137],"models":[135],"traces":[138],"of":[139],"up":[140,194,220,237],"32k":[142],"tokens":[143],"across":[144],"5":[145],"tasks.":[146],"On":[147,189],"Qwen3-4B-Thinking-2507":[148],"Qwen3-8B,":[150],"reduces":[152,211],"BF16":[154,241],"accuracy":[155],"gap":[156],"3.78":[158],"1.42":[160],"points,":[161],"respectively,":[162],"while":[163,204],"naive":[164,205],"rotation":[165,206],"collapses":[167],"nearly":[169],"zero.":[170],"further":[172],"scale":[173],"Qwen3-32B":[176],"GLM-4.7":[178],"(358B":[179],"params),":[180],"where":[181],"effectively":[184],"par":[186],"BF16.":[188],"long":[190],"context":[191],"-":[192],"RULER-NIAH":[193],"128K,":[196],"robust":[199],"Qwen3":[202],"models,":[203],"collapses.":[208],"System-wise,":[209],"memory":[213,230,245],"by":[214,219,236],"approximately":[215],"8x,":[216],"improves":[217],"throughput":[218],"7x":[222],"large":[224],"batch":[225],"sizes":[226],"under":[227],"same":[229],"budget,":[231],"accelerates":[233],"batch-size-1":[234],"decoding":[235],"3x":[239],"over":[240],"due":[242],"reduced":[244],"bandwidth":[246],"overhead.":[247]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
