{"id":"https://openalex.org/W4412886952","doi":"https://doi.org/10.18653/v1/2025.acl-industry.101","title":"TaDA: Training-free recipe for Decoding with Adaptive KV Cache Compression and Mean-centering","display_name":"TaDA: Training-free recipe for Decoding with Adaptive KV Cache Compression and Mean-centering","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412886952","doi":"https://doi.org/10.18653/v1/2025.acl-industry.101"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-industry.101","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-industry.101","pdf_url":"https://aclanthology.org/2025.acl-industry.101.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.acl-industry.101.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054971357","display_name":"Vinay Joshi","orcid":"https://orcid.org/0000-0001-6031-1669"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Vinay Joshi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119180929","display_name":"Pratik Prabhanjan Brahma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pratik Prabhanjan Brahma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101728120","display_name":"Zicheng Liu","orcid":"https://orcid.org/0000-0003-1106-2963"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zicheng Liu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5027115167","display_name":"Emad Barsoum","orcid":"https://orcid.org/0000-0002-4097-8690"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Emad Barsoum","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5054971357"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.10583748,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1435","last_page":"1443"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.9833999872207642,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/recipe","display_name":"Recipe","score":0.8569687008857727},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7472531795501709},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7410394549369812},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6963557004928589},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.4506067931652069},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3948383629322052},{"id":"https://openalex.org/keywords/arithmetic","display_name":"Arithmetic","score":0.3633761405944824},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.22684231400489807},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.12666431069374084},{"id":"https://openalex.org/keywords/history","display_name":"History","score":0.08732303977012634}],"concepts":[{"id":"https://openalex.org/C2778671685","wikidata":"https://www.wikidata.org/wiki/Q219239","display_name":"Recipe","level":2,"score":0.8569687008857727},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7472531795501709},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7410394549369812},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6963557004928589},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.4506067931652069},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3948383629322052},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.3633761405944824},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.22684231400489807},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.12666431069374084},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.08732303977012634},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.acl-industry.101","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-industry.101","pdf_url":"https://aclanthology.org/2025.acl-industry.101.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.acl-industry.101","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-industry.101","pdf_url":"https://aclanthology.org/2025.acl-industry.101.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412886952.pdf","grobid_xml":"https://content.openalex.org/works/W4412886952.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3161239248","https://openalex.org/W258429745","https://openalex.org/W2740181000","https://openalex.org/W2561508161","https://openalex.org/W3123969856","https://openalex.org/W1584543623","https://openalex.org/W2098178683","https://openalex.org/W2740680361","https://openalex.org/W3195543079","https://openalex.org/W3187068967"],"abstract_inverted_index":{"The":[0],"key-value":[1],"(KV)":[2],"cache":[3,40,53,75,134],"in":[4,120,157],"transformer":[5],"models":[6,103,159],"is":[7],"a":[8,26,70,88],"critical":[9],"component":[10],"for":[11,29,73,101,152,164],"efficient":[12],"decoding":[13],"or":[14],"inference,":[15],"yet":[16],"its":[17],"memory":[18,135],"demands":[19],"scale":[20],"poorly":[21],"with":[22,77],"sequence":[23],"length,":[24],"posing":[25],"major":[27],"challenge":[28],"scalable":[30,153],"deployment":[31],"of":[32,43,139,174],"large":[33],"language":[34,158],"models.Among":[35],"several":[36],"approaches":[37],"to":[38,58,82,91,113,137],"KV":[39,52,74,133],"compression,":[41],"quantization":[42,54,78,123],"key":[44],"and":[45,61,87,154,171],"value":[46],"activations":[47],"has":[48],"been":[49],"widely":[50],"explored.Most":[51],"methods":[55],"still":[56],"need":[57,112],"manage":[59,115],"sparse":[60],"noncontiguous":[62],"outliers":[63],"separately.To":[64],"address":[65],"this,":[66],"we":[67],"introduce":[68],"TaDA,":[69],"training-free":[71],"recipe":[72],"compression":[76],"precision":[79],"that":[80,129],"adapts":[81],"error":[83],"sensitivity":[84],"across":[85],"layers":[86],"mean":[89],"centering":[90],"eliminate":[92],"separate":[93],"outlier":[94,116],"handling.Our":[95],"approach":[96,109],"yields":[97],"substantial":[98],"accuracy":[99],"improvements":[100],"multiple":[102],"supporting":[104],"various":[105],"context":[106,166],"lengths.Moreover,":[107],"our":[108,130],"does":[110],"not":[111],"separately":[114],"elements-a":[117],"persistent":[118],"hurdle":[119],"most":[121],"traditional":[122],"methods.Experiments":[124],"on":[125],"standard":[126],"benchmarks":[127],"demonstrate":[128],"technique":[131],"reduces":[132],"footprint":[136],"27%":[138],"the":[140,150],"original":[141],"16-bit":[142],"baseline":[143],"while":[144],"achieving":[145],"comparable":[146],"accuracy.Our":[147],"method":[148],"paves":[149],"way":[151],"high-performance":[155],"reasoning":[156,169],"by":[160],"potentially":[161],"enabling":[162],"inference":[163],"longer":[165,172],"length":[167],"models,":[168,170],"chain":[173],"thoughts.":[175]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
