{"id":"https://openalex.org/W4410226886","doi":"https://doi.org/10.1109/jetcas.2025.3568716","title":"End-to-End Acceleration of Generative Models With Runtime Regularized KV Cache Management","display_name":"End-to-End Acceleration of Generative Models With Runtime Regularized KV Cache Management","publication_year":2025,"publication_date":"2025-05-09","ids":{"openalex":"https://openalex.org/W4410226886","doi":"https://doi.org/10.1109/jetcas.2025.3568716"},"language":"en","primary_location":{"id":"doi:10.1109/jetcas.2025.3568716","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jetcas.2025.3568716","pdf_url":null,"source":{"id":"https://openalex.org/S142323794","display_name":"IEEE Journal on Emerging and Selected Topics in Circuits and Systems","issn_l":"2156-3357","issn":["2156-3357","2156-3365"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal on Emerging and Selected Topics in Circuits and Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5107078868","display_name":"Ashkan Moradifirouzabadi","orcid":"https://orcid.org/0009-0007-5112-300X"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ashkan Moradifirouzabadi","raw_affiliation_strings":["Department of Electrical and Computer Engineering (ECE), University of California at San Diego, San Diego, CA, USA","Department of Electrical and Computer Engineering (ECE), University of California, San Diego, USA"],"raw_orcid":"https://orcid.org/0009-0007-5112-300X","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering (ECE), University of California at San Diego, San Diego, CA, USA","institution_ids":["https://openalex.org/I36258959"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering (ECE), University of California, San Diego, USA","institution_ids":["https://openalex.org/I36258959"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5006712350","display_name":"Mingu Kang","orcid":"https://orcid.org/0000-0001-8104-5136"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mingu Kang","raw_affiliation_strings":["Department of Electrical and Computer Engineering (ECE), University of California at San Diego, San Diego, CA, USA","Department of Electrical and Computer Engineering (ECE), University of California, San Diego, USA"],"raw_orcid":"https://orcid.org/0000-0001-8104-5136","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering (ECE), University of California at San Diego, San Diego, CA, USA","institution_ids":["https://openalex.org/I36258959"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering (ECE), University of California, San Diego, USA","institution_ids":["https://openalex.org/I36258959"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I36258959"],"apc_list":null,"apc_paid":null,"fwci":3.1393,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.91735706,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"15","issue":"2","first_page":"217","last_page":"230"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.6885398030281067},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.6309562921524048},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6222006678581238},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5372308492660522},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.337289035320282},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.2921409010887146},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1144646406173706}],"concepts":[{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.6885398030281067},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.6309562921524048},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6222006678581238},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5372308492660522},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.337289035320282},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.2921409010887146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1144646406173706},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/jetcas.2025.3568716","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jetcas.2025.3568716","pdf_url":null,"source":{"id":"https://openalex.org/S142323794","display_name":"IEEE Journal on Emerging and Selected Topics in Circuits and Systems","issn_l":"2156-3357","issn":["2156-3357","2156-3365"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal on Emerging and Selected Topics in Circuits and Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6700000166893005,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W2167425979","https://openalex.org/W2588191434","https://openalex.org/W2606722458","https://openalex.org/W2808513690","https://openalex.org/W2906043559","https://openalex.org/W2980104813","https://openalex.org/W3017024317","https://openalex.org/W3159727696","https://openalex.org/W3180037928","https://openalex.org/W3189877953","https://openalex.org/W3206453033","https://openalex.org/W3207622241","https://openalex.org/W4224267386","https://openalex.org/W4293025109","https://openalex.org/W4308083827","https://openalex.org/W4318541578","https://openalex.org/W4360832001","https://openalex.org/W4380881077","https://openalex.org/W4385245566","https://openalex.org/W4393407046","https://openalex.org/W4393407269","https://openalex.org/W4394998968","https://openalex.org/W4401211806","https://openalex.org/W4401211807","https://openalex.org/W4402349419","https://openalex.org/W4403676391","https://openalex.org/W4404954358","https://openalex.org/W6727099177","https://openalex.org/W6761628794","https://openalex.org/W6768817161","https://openalex.org/W6769627184","https://openalex.org/W6776048684","https://openalex.org/W6778883912","https://openalex.org/W6779709467","https://openalex.org/W6850162387","https://openalex.org/W6852408377","https://openalex.org/W6852872144","https://openalex.org/W6853192989","https://openalex.org/W6856619137","https://openalex.org/W6857690716","https://openalex.org/W6862187050","https://openalex.org/W6862644178","https://openalex.org/W6870066108"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2151749779","https://openalex.org/W3179968364","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109"],"abstract_inverted_index":{"Despite":[0],"their":[1],"remarkable":[2],"success":[3],"in":[4,39,74],"achieving":[5],"high":[6],"performance,":[7],"Transformer-based":[8],"models":[9],"impose":[10],"substantial":[11],"computational":[12,56],"and":[13,58,90,109,124,149,155,163,179,209],"memory":[14],"bandwidth":[15],"requirements,":[16],"posing":[17],"significant":[18],"challenges":[19,28],"for":[20],"hardware":[21,80,89],"deployment.":[22],"A":[23],"key":[24],"contributor":[25],"to":[26,41,53,168,187,227],"these":[27,69,84],"is":[29],"the":[30,42,55,62,75,105,174,202,228],"large":[31],"KV":[32,119,137,189],"cache,":[33],"which":[34,142],"increases":[35],"data":[36],"movement":[37],"costs":[38],"addition":[40],"model":[43],"parameters.":[44],"While":[45],"various":[46],"token":[47],"pruning":[48],"techniques":[49],"have":[50],"been":[51],"proposed":[52,183,203],"reduce":[54],"complexity":[57],"storage":[59,154],"requirements":[60,175],"of":[61,176,218],"attention":[63],"mechanism":[64],"by":[65],"eliminating":[66],"redundant":[67],"tokens,":[68],"methods":[70],"often":[71],"introduce":[72],"irregularities":[73],"sparsity":[76],"patterns":[77],"that":[78,103],"complicate":[79],"implementation.":[81],"To":[82,126],"address":[83],"challenges,":[85],"we":[86,130],"propose":[87],"a":[88,97,117,136,198,215,222],"algorithm":[91,102],"co-design":[92],"approach.":[93],"Our":[94],"solution":[95],"features":[96],"Runtime":[98],"Cache":[99],"Eviction":[100],"(RCE)":[101],"removes":[104],"least":[106],"relevant":[107],"tokens":[108,146],"replaces":[110],"them":[111],"with":[112,135,193,220],"newly":[113],"generated":[114],"ones,":[115],"maintaining":[116],"constant":[118],"cache":[120,190],"size":[121,191,217],"across":[122],"blocks":[123],"inputs.":[125],"support":[127],"this":[128],"algorithm,":[129],"design":[131,159],"an":[132,164],"accelerator":[133,204],"equipped":[134],"Memory":[138],"Management":[139],"Unit":[140],"(KV-MMU),":[141],"efficiently":[143],"manages":[144],"active":[145],"through":[147],"eviction":[148],"replacement,":[150],"thereby":[151],"optimizing":[152],"DRAM":[153],"access.":[156],"Additionally,":[157],"our":[158],"integrates":[160],"batch":[161,216],"processing":[162,166,214],"optimized":[165],"pipeline":[167],"improve":[169],"end-to-end":[170],"throughput,":[171],"effectively":[172],"meeting":[173],"both":[177],"pre-filling":[178],"generation":[180],"stages.":[181],"The":[182],"system":[184],"achieves":[185],"up":[186],"8\u00d7":[188],"reduction":[192],"minimal":[194],"accuracy":[195],"degradation.":[196],"In":[197],"65":[199],"nm":[200],"process,":[201],"demonstrates":[205],"1.52\u00d7":[206],"energy":[207,224],"savings":[208],"3.62\u00d7":[210],"delay":[211],"reductions":[212],"when":[213],"16,":[219],"only":[221],"1.11%":[223],"overhead":[225],"attributed":[226],"specialized":[229],"KV-MMU.":[230]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-10-10T00:00:00"}
