{"id":"https://openalex.org/W7135185364","doi":"https://doi.org/10.48550/arxiv.2603.11504","title":"LongFlow: Efficient KV Cache Compression for Reasoning Models","display_name":"LongFlow: Efficient KV Cache Compression for Reasoning Models","publication_year":2026,"publication_date":"2026-03-12","ids":{"openalex":"https://openalex.org/W7135185364","doi":"https://doi.org/10.48550/arxiv.2603.11504"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.11504","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11504","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.11504","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128938384","display_name":"Yi Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Su, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025685427","display_name":"Zhenxu Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Zhenxu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129074979","display_name":"Dan Qiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033814148","display_name":"Yuechi Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yuechi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129030935","display_name":"Juntao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Juntao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129088681","display_name":"Min Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Min","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5128938384"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.40070000290870667,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.40070000290870667,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.11550000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.06719999760389328,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6965000033378601},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4968999922275543},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.44589999318122864},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.43619999289512634},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4108000099658966},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.4056999981403351},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.3889999985694885},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.35690000653266907}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7985000014305115},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6965000033378601},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4968999922275543},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.44589999318122864},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.43619999289512634},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4255000054836273},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4131999909877777},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4108000099658966},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.4056999981403351},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.3889999985694885},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.35690000653266907},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.35429999232292175},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.3379000127315521},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.30320000648498535},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.30140000581741333},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.26809999346733093},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.26330000162124634},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2554999887943268}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.11504","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11504","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.11504","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11504","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"reasoning":[1,17,77],"models":[2],"such":[3],"as":[4],"OpenAI-o1":[5],"and":[6,18,50,69,88,136,152],"DeepSeek-R1":[7],"have":[8],"shown":[9],"strong":[10],"performance":[11,23],"on":[12,183],"complex":[13],"tasks":[14],"including":[15],"mathematical":[16],"code":[19],"generation.":[20,98],"However,":[21],"this":[22],"gain":[24],"comes":[25],"with":[26,111,175,180],"substantially":[27],"longer":[28],"output":[29],"sequences,":[30],"leading":[31],"to":[32,169],"significantly":[33],"increased":[34],"deployment":[35],"costs.":[36],"In":[37],"particular,":[38],"long":[39,97],"outputs":[40],"require":[41],"large":[42],"KV":[43,59,107,177],"caches,":[44],"resulting":[45],"in":[46,82],"high":[47],"memory":[48],"consumption":[49],"severe":[51],"bandwidth":[52],"pressure":[53],"during":[54,96],"attention":[55,123],"computation.":[56],"Most":[57],"existing":[58],"cache":[60,108,178],"optimization":[61],"methods":[62],"are":[63,70],"designed":[64],"for":[65,72],"long-input,":[66],"short-output":[67],"scenarios":[68],"ineffective":[71],"the":[73,127],"long-output":[74],"setting":[75],"of":[76,122],"models.":[78],"Moreover,":[79],"importance":[80,114,150],"estimation":[81,115],"prior":[83],"work":[84],"is":[85,94],"computationally":[86],"expensive":[87],"becomes":[89],"prohibitive":[90],"when":[91],"continuous":[92],"re-evaluation":[93],"required":[95],"To":[99],"address":[100],"these":[101],"challenges,":[102],"we":[103],"propose":[104],"LongFlow,":[105],"a":[106,144,156],"compression":[109,179],"method":[110],"an":[112,119,170],"efficient":[113],"metric":[116],"derived":[117],"from":[118],"intermediate":[120],"result":[121],"computation":[124],"using":[125],"only":[126],"current":[128],"query.":[129],"This":[130],"design":[131],"introduces":[132],"negligible":[133],"computational":[134],"overhead":[135],"requires":[137],"no":[138],"auxiliary":[139],"storage.":[140],"We":[141],"further":[142],"develop":[143],"custom":[145],"kernel":[146],"that":[147,165],"fuses":[148],"FlashAttention,":[149],"estimation,":[151],"token":[153],"eviction":[154],"into":[155],"single":[157],"optimized":[158],"operator,":[159],"improving":[160],"system-level":[161],"efficiency.":[162],"Experiments":[163],"show":[164],"LongFlow":[166],"achieves":[167],"up":[168],"11.8":[171],"times":[172],"throughput":[173],"improvement":[174],"80%":[176],"minimal":[181],"impact":[182],"model":[184],"accuracy.":[185]},"counts_by_year":[],"updated_date":"2026-04-29T06:10:49.150238","created_date":"2026-03-14T00:00:00"}
