{"id":"https://openalex.org/W7155174631","doi":"https://doi.org/10.48550/arxiv.2604.19157","title":"SAW-INT4: System-Aware 4-Bit KV-Cache Quantization for Real-World LLM Serving","display_name":"SAW-INT4: System-Aware 4-Bit KV-Cache Quantization for Real-World LLM Serving","publication_year":2026,"publication_date":"2026-04-21","ids":{"openalex":"https://openalex.org/W7155174631","doi":"https://doi.org/10.48550/arxiv.2604.19157"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.19157","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19157","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.19157","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109706936","display_name":"Jinda Jia","orcid":"https://orcid.org/0009-0008-5283-5241"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Jinda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134312322","display_name":"Jisen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jisen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069302961","display_name":"Zhongzhu Zhou","orcid":"https://orcid.org/0000-0002-7786-6887"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhongzhu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056065406","display_name":"Jung Hwan Heo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heo, Jung Hwan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134269588","display_name":"Jue Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134314192","display_name":"Tri Dao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dao, Tri","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134254553","display_name":"Shuaiwen Leon Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Shuaiwen Leon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013502032","display_name":"Ben Athiwaratkun","orcid":"https://orcid.org/0000-0002-2009-496X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Athiwaratkun, Ben","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134321105","display_name":"Chenfeng Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Chenfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134283344","display_name":"Tianyi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tianyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134256598","display_name":"Xiaoxia Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xiaoxia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.567300021648407,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.567300021648407,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.1282999962568283,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.1046999990940094,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.6678000092506409},{"id":"https://openalex.org/keywords/concurrency","display_name":"Concurrency","score":0.5180000066757202},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.4652999937534332},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.45719999074935913},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.39640000462532043},{"id":"https://openalex.org/keywords/hadamard-transform","display_name":"Hadamard transform","score":0.3962000012397766}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6881999969482422},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.6678000092506409},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5713000297546387},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.5180000066757202},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.4652999937534332},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.45719999074935913},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3986999988555908},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.39640000462532043},{"id":"https://openalex.org/C60292330","wikidata":"https://www.wikidata.org/wiki/Q1014065","display_name":"Hadamard transform","level":2,"score":0.3962000012397766},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.3458999991416931},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3434999883174896},{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.2897000014781952},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.2685000002384186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.19157","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19157","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.19157","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19157","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"KV-cache":[0,24,66,149,169],"memory":[1,42,45],"is":[2,78,130,171,185],"a":[3,80,140,173,186],"major":[4],"bottleneck":[5],"in":[6,54],"real-world":[7],"LLM":[8],"serving,":[9],"where":[10],"systems":[11,174],"must":[12],"simultaneously":[13],"support":[14],"latency-sensitive":[15],"small-batch":[16],"requests":[17],"and":[18,47,97,119,151],"high-throughput":[19],"concurrent":[20],"workloads.":[21],"Although":[22],"many":[23],"compression":[25,31,170],"methods":[26,68,114],"improve":[27],"offline":[28],"accuracy":[29,106,192],"or":[30],"ratio,":[32],"they":[33],"often":[34],"violate":[35],"practical":[36],"serving":[37,128,179,195],"constraints":[38],"such":[39,115],"as":[40,116],"paged":[41,148],"layouts,":[43],"regular":[44],"access,":[46],"fused":[48,141],"attention":[49],"execution,":[50],"limiting":[51],"their":[52],"effectiveness":[53],"deployment.":[55],"In":[56],"this":[57,99,136],"work,":[58],"we":[59,138],"identify":[60],"the":[61,90,105],"minimal":[62],"set":[63],"of":[64,104],"4-bit":[65],"quantization":[67,84,118,121],"that":[69,79,144,167,189],"remain":[70],"viable":[71,187],"under":[72,177],"these":[73],"constraints.":[74],"Our":[75,164],"central":[76],"finding":[77],"simple":[81],"design--token-wise":[82],"INT4":[83,159],"with":[85],"block-diagonal":[86,182],"Hadamard":[87,183],"rotation--consistently":[88],"achieves":[89],"best":[91],"accuracy-efficiency":[92],"trade-off.":[93],"Across":[94],"multiple":[95],"models":[96],"benchmarks,":[98],"approach":[100],"recovers":[101],"nearly":[102],"all":[103],"lost":[107],"by":[108],"naive":[109],"INT4,":[110],"while":[111],"more":[112],"complex":[113],"vector":[117],"Hessian-aware":[120],"provide":[122],"only":[123],"marginal":[124],"additional":[125],"gains":[126],"once":[127],"compatibility":[129],"taken":[131],"into":[132,147],"account.":[133],"To":[134],"make":[135],"practical,":[137],"implement":[139],"rotation-quantization":[142],"kernel":[143],"integrates":[145],"directly":[146],"layouts":[150],"introduces":[152],"zero":[153],"measurable":[154],"end-to-end":[155],"overhead,":[156],"matching":[157],"plain":[158],"throughput":[160],"across":[161],"concurrency":[162],"levels.":[163],"results":[165],"show":[166],"effective":[168],"fundamentally":[172],"co-design":[175],"problem:":[176],"real":[178],"constraints,":[180],"lightweight":[181],"rotation":[184],"method":[188],"delivers":[190],"near-lossless":[191],"without":[193],"sacrificing":[194],"efficiency.":[196]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-23T00:00:00"}
