{"id":"https://openalex.org/W7138355746","doi":"https://doi.org/10.1609/aaai.v40i7.37467","title":"AccKV: Towards Efficient Audio-Video LLMs Inference via Adaptive-Focusing and Cross-Calibration KV Cache Optimization","display_name":"AccKV: Towards Efficient Audio-Video LLMs Inference via Adaptive-Focusing and Cross-Calibration KV Cache Optimization","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138355746","doi":"https://doi.org/10.1609/aaai.v40i7.37467"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i7.37467","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37467","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37467/41429","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37467/41429","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129723362","display_name":"Zhonghua Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhonghua Jiang","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100682687","display_name":"Kui Chen","orcid":"https://orcid.org/0000-0002-3910-1402"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kui Chen","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129683499","display_name":"Kunxi Li","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kunxi Li","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120655297","display_name":"Keting Yin","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Keting Yin","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129713493","display_name":"Yiyun Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiyun Zhou","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129753256","display_name":"Zhaode Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhaode Wang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129662631","display_name":"Chengfei Lv","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chengfei Lv","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129670063","display_name":"Shengyu Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengyu Zhang","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5129723362"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.71865672,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"7","first_page":"5494","last_page":"5502"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.885200023651123,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.885200023651123,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0272000003606081,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.014000000432133675,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7712000012397766},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5198000073432922},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.48890000581741333},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.46939998865127563},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.42989999055862427},{"id":"https://openalex.org/keywords/reservation","display_name":"Reservation","score":0.4058000147342682},{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.3709999918937683},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.33410000801086426}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8478999733924866},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7712000012397766},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5198000073432922},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.48890000581741333},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.46939998865127563},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.42989999055862427},{"id":"https://openalex.org/C2777632111","wikidata":"https://www.wikidata.org/wiki/Q1937518","display_name":"Reservation","level":2,"score":0.4058000147342682},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.3709999918937683},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.34450000524520874},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3386000096797943},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3368000090122223},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.33410000801086426},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3334999978542328},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.32919999957084656},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.2987000048160553},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2935999929904938},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C2781140086","wikidata":"https://www.wikidata.org/wiki/Q557945","display_name":"Confusion","level":2,"score":0.2833000123500824},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2799000144004822},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2750000059604645},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i7.37467","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37467","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37467/41429","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i7.37467","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37467","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37467/41429","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.5752983689308167,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138355746.pdf","grobid_xml":"https://content.openalex.org/works/W7138355746.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,12,31,63,76],"Audio-Video":[3],"Large":[4],"Language":[5],"Models":[6],"(AV-LLMs)":[7],"have":[8],"enhanced":[9],"their":[10],"capabilities":[11],"tasks":[13],"like":[14],"audio-visual":[15],"question":[16],"answering":[17],"and":[18,23,51,112,122,130,163,196,223,226],"multimodal":[19],"dialog":[20],"systems.":[21],"Video":[22],"audio":[24,56,111,129,222],"introduce":[25],"an":[26,161],"extended":[27],"temporal":[28,108],"dimension,":[29],"resulting":[30],"a":[32,145,211],"larger":[33],"key-value":[34],"(KV)":[35],"cache":[36,166,238],"compared":[37],"to":[38,47,73,119,139,190,234],"static":[39],"image":[40],"embedding.":[41],"A":[42],"naive":[43],"optimization":[44,167],"strategy":[45],"is":[46,80,177],"selectively":[48,184,235],"focus":[49],"on":[50,60,84,179,186],"retain":[52],"KV":[53,109,114,165,218,237],"caches":[54,219],"of":[55,71,92,110,115,126,144,193,200,239,254],"or":[57,142],"video":[58,98,116,131,224],"based":[59,178],"task.":[61,86],"However,":[62],"the":[64,69,77,85,90,97,150,191,198,221,251],"experiment,":[65],"we":[66,102,158,209],"observed":[67],"that":[68,105,214,246],"attention":[70,91,205],"AV-LLMs":[72,93,173,255],"various":[74],"modalities":[75,188],"high":[78],"layers":[79],"not":[81],"strictly":[82],"dependent":[83],"In":[87,100,207],"higher":[88],"layers,":[89,195],"shifts":[94],"more":[95],"towards":[96],"modality.":[99,241],"addition,":[101,208],"also":[103,137],"found":[104],"directly":[106],"integrating":[107],"spatial-temporal":[113],"may":[117,136],"lead":[118,138],"information":[120],"confusion":[121],"significant":[123],"performance":[124],"degradation":[125],"AV-LLMs.":[127],"If":[128],"are":[132],"processed":[133],"indiscriminately,":[134],"it":[135],"excessive":[140],"compression":[141],"reservation":[143],"certain":[146],"modality,":[147],"thereby":[148],"disrupting":[149],"alignment":[151],"between":[152],"modalities.":[153],"To":[154],"address":[155],"these":[156],"challenges,":[157],"propose":[159,210],"AccKV,":[160],"Adaptive-Focusing":[162],"Cross-Calibration":[164,212],"framework":[168],"designed":[169],"specifically":[170],"for":[171],"efficient":[172],"inference.":[174],"Our":[175],"method":[176],"layer":[180],"adaptive":[181],"focusing":[182,185],"technology,":[183],"key":[187],"according":[189],"characteristics":[192],"different":[194],"enhances":[197],"recognition":[199],"heavy":[201],"hitter":[202],"tokens":[203],"through":[204],"redistribution.":[206],"technique":[213],"first":[215],"integrates":[216],"inefficient":[217],"within":[220],"modalities,":[225],"then":[227],"aligns":[228],"low-priority":[229,240],"modality":[230,233],"with":[231],"high-priority":[232],"evict":[236],"The":[242],"experimental":[243],"results":[244],"show":[245],"AccKV":[247],"can":[248],"significantly":[249],"improve":[250],"computational":[252],"efficiency":[253],"while":[256],"maintaining":[257],"accuracy.":[258]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
