{"id":"https://openalex.org/W7133528314","doi":"https://doi.org/10.1109/hpca68181.2026.11408603","title":"V-Rex: Real-Time Streaming Video LLM Acceleration via Dynamic KV Cache Retrieval","display_name":"V-Rex: Real-Time Streaming Video LLM Acceleration via Dynamic KV Cache Retrieval","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7133528314","doi":"https://doi.org/10.1109/hpca68181.2026.11408603"},"language":null,"primary_location":{"id":"doi:10.1109/hpca68181.2026.11408603","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408603","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128059194","display_name":"Donghyuk Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Donghyuk Kim","raw_affiliation_strings":["KAIST,Daejeon,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,Daejeon,Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128042383","display_name":"Sejeong Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Sejeong Yang","raw_affiliation_strings":["KAIST,Daejeon,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,Daejeon,Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079083422","display_name":"Wonjin Shin","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Wonjin Shin","raw_affiliation_strings":["KAIST,Daejeon,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,Daejeon,Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100447377","display_name":"Joo-Young Kim","orcid":"https://orcid.org/0000-0003-1099-1496"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Joo-Young Kim","raw_affiliation_strings":["KAIST,Daejeon,Republic of Korea"],"affiliations":[{"raw_affiliation_string":"KAIST,Daejeon,Republic of Korea","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5128059194"],"corresponding_institution_ids":["https://openalex.org/I157485424"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.4875257,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"14"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.656000018119812,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.656000018119812,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10742","display_name":"Peer-to-Peer Network Technologies","score":0.11349999904632568,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11165","display_name":"Image and Video Quality Assessment","score":0.044199999421834946,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.4327999949455261},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.3626999855041504},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.30219998955726624},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.2506999969482422}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7110999822616577},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.4327999949455261},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.32010000944137573},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.30219998955726624},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2671000063419342},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.26269999146461487},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2506999969482422},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.2498999983072281}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca68181.2026.11408603","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408603","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy","score":0.7523948550224304}],"awards":[],"funders":[{"id":"https://openalex.org/F4320335489","display_name":"Institute for Information and Communications Technology Promotion","ror":"https://ror.org/01g0hqq23"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2620878116","https://openalex.org/W2771754046","https://openalex.org/W2964094654","https://openalex.org/W3006586535","https://openalex.org/W4308083513","https://openalex.org/W4389519587","https://openalex.org/W4390873312","https://openalex.org/W4400448267","https://openalex.org/W4401176373","https://openalex.org/W4402727512","https://openalex.org/W4404356490","https://openalex.org/W4404955778","https://openalex.org/W4409248360","https://openalex.org/W4409248709","https://openalex.org/W4409748580","https://openalex.org/W4410027467","https://openalex.org/W4411486373","https://openalex.org/W4411486388","https://openalex.org/W4411486604","https://openalex.org/W4411487104","https://openalex.org/W4415795803"],"related_works":[],"abstract_inverted_index":{"Streaming":[0],"video":[1,15,43,59,140,168,199,269,321],"large":[2],"language":[3],"models":[4,26],"(LLMs)":[5],"are":[6],"increasingly":[7],"used":[8],"for":[9,119,127,283,331],"real-time":[10,262,319],"multimodal":[11],"tasks":[12],"such":[13],"as":[14,247,249],"captioning,":[16],"question":[17],"answering,":[18],"conversational":[19],"agents,":[20],"and":[21,30,110,163,187,201,209,242,266,286,294,316],"augmented":[22],"reality.":[23],"However,":[24],"these":[25,128,222],"face":[27],"fundamental":[28],"memory":[29,82,131,135,197,253],"computational":[31],"challenges":[32],"because":[33],"their":[34],"key-value":[35],"(KV)":[36],"caches":[37],"grow":[38],"substantially":[39],"with":[40,232,275,328],"continuous":[41],"streaming":[42,58,167,268,320],"input.":[44],"This":[45,303],"process":[46],"requires":[47],"an":[48],"iterative":[49,96],"prefill":[50,97],"stage,":[51,98],"which":[52,74,122],"is":[53,117,123,305],"a":[54,177,227,233],"unique":[55],"feature":[56],"of":[57,139,216,263],"LLMs.":[60],"Prior":[61],"works":[62],"reduce":[63,193],"excessive":[64,194],"cache":[65,71,79,181,196,236,252,312],"overhead":[66],"by":[67],"utilizing":[68],"the":[69,76,88,124,134,153,214,289,306],"KV":[70,78,180,195,235,251,311],"retrieval":[72,182,237,313],"algorithm,":[73],"offloads":[75],"full":[77],"to":[80,94,192,212,308],"CPU":[81],"or":[83],"storage,":[84],"then":[85],"selectively":[86],"fetches":[87],"most":[89],"relevant":[90],"entries.":[91],"Nevertheless,":[92],"due":[93],"its":[95,172],"they":[99],"suffer":[100],"from":[101],"significant":[102],"limitations,":[103],"including":[104],"extensive":[105],"computation,":[106],"substantial":[107],"data":[108],"transfer,":[109],"degradation":[111],"in":[112,166,334],"accuracy.":[113],"Crucially,":[114],"this":[115,148],"issue":[116],"exacerbated":[118],"edge":[120,273,326],"deployment,":[121],"primary":[125],"target":[126],"models.":[129],"The":[130],"footprint":[132],"exceeds":[133],"capacity":[136],"within":[137],"minutes":[138],"streams,":[141],"making":[142],"low-latency,":[143],"energy-efficient":[144,267],"inference":[145,271,323],"infeasible.":[146],"In":[147],"work,":[149],"we":[150],"propose":[151],"V-Rex,":[152],"first":[154,307],"software-hardware":[155],"co-designed":[156],"accelerator":[157,231],"that":[158],"comprehensively":[159,309],"addresses":[160],"both":[161],"algorithmic":[162,223],"hardware":[164,230],"bottlenecks":[165],"LLM":[169,270,322],"inference.":[170],"At":[171],"core,":[173],"V-Rex":[174,225,259],"introduces":[175],"ReSV,":[176],"training-free":[178],"dynamic":[179,234],"algorithm.":[183],"ReSV":[184],"exploits":[185],"temporal":[186],"spatial":[188],"similarity-based":[189],"token":[190,204],"clustering":[191],"across":[198,314],"frames,":[200],"dynamically":[202],"adjusts":[203],"selection":[205],"per":[206],"transformer":[207],"layer":[208],"attention":[210],"head":[211],"minimize":[213],"number":[215],"selected":[217],"tokens.":[218],"To":[219],"fully":[220],"realize":[221],"benefits,":[224],"offers":[226],"compact,":[228],"low-latency":[229],"engine":[238],"(DRE),":[239],"featuring":[240],"bit-level":[241],"early-exit":[243],"based":[244],"computing":[245],"units,":[246],"well":[248],"hierarchical":[250],"management.":[254],"Evaluated":[255],"on":[256,272,324],"COIN":[257],"benchmarks,":[258],"achieves":[260],"unprecedented":[261],"3.9-8.3":[264],"FPS":[265],"deployment":[274,333],"negligible":[276],"accuracy":[277],"loss.":[278],"While":[279],"DRE":[280],"only":[281],"accounts":[282],"2.2%":[284],"power":[285],"2.0%":[287],"area,":[288],"system":[290],"delivers":[291],"1.9-19.7\u00d7":[292],"speedup":[293],"3.1-18.5\u00d7":[295],"energy":[296],"efficiency":[297],"improvements":[298],"over":[299],"AGX":[300],"Orin":[301],"GPU.":[302],"work":[304],"tackle":[310],"algorithm":[315],"hardware,":[317],"enabling":[318],"resource-constrained":[325],"devices,":[327],"clear":[329],"potential":[330],"scalable":[332],"large-scale":[335],"server":[336],"environments.":[337]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-03-05T00:00:00"}
