{"id":"https://openalex.org/W7140223168","doi":"https://doi.org/10.48550/arxiv.2603.21957","title":"Unified Spatiotemporal Token Compression for Video-LLMs at Ultra-Low Retention","display_name":"Unified Spatiotemporal Token Compression for Video-LLMs at Ultra-Low Retention","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140223168","doi":"https://doi.org/10.48550/arxiv.2603.21957"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21957","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Du, Junhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Du, Junhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xue, Jialong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Jialong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Anqi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Anqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Dai, Jincheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Jincheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Lu, Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Guo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9230999946594238,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9230999946594238,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.032600000500679016,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.009100000374019146,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.760699987411499},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.503000020980835},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.48809999227523804},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.44839999079704285},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.44350001215934753},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.4341999888420105},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.4334999918937683},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4171000123023987}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8284000158309937},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.760699987411499},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.503000020980835},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.48809999227523804},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4553999900817871},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.44839999079704285},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.44350001215934753},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.4341999888420105},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.4334999918937683},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4171000123023987},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4092999994754791},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.3833000063896179},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33009999990463257},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.30300000309944153},{"id":"https://openalex.org/C13481523","wikidata":"https://www.wikidata.org/wiki/Q412438","display_name":"Image compression","level":4,"score":0.29910001158714294},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2921999990940094},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.2858000099658966},{"id":"https://openalex.org/C94835093","wikidata":"https://www.wikidata.org/wiki/Q3113333","display_name":"Data compression ratio","level":5,"score":0.27630001306533813},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C127532173","wikidata":"https://www.wikidata.org/wiki/Q179904","display_name":"Hourglass","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.25619998574256897},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2542000114917755},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"large":[1,11],"language":[2],"models":[3],"(Video-LLMs)":[4],"face":[5],"high":[6,93],"computational":[7],"costs":[8],"due":[9],"to":[10,88,117,160],"volumes":[12],"of":[13,35,53,146,151],"visual":[14,54,147],"tokens.":[15],"Existing":[16],"token":[17,62,72,179,190],"compression":[18,25,63,120,180],"methods":[19],"typically":[20],"adopt":[21],"a":[22,65,70,77,132],"two-stage":[23],"spatiotemporal":[24,36,66,178],"strategy,":[26],"relying":[27],"on":[28,122],"stage-specific":[29],"metrics":[30],"and":[31,51,85,95,104,173],"an":[32],"implicit":[33],"assumption":[34],"separability.":[37],"Under":[38],"extremely":[39],"low":[40,96],"retention":[41,73],"ratios,":[42],"however,":[43],"such":[44],"approaches":[45],"often":[46],"result":[47],"in":[48,185],"unbalanced":[49],"allocation":[50,67],"loss":[52],"evidence":[55],"essential":[56],"for":[57],"question":[58],"answering.":[59],"We":[60,75],"reformulate":[61],"as":[64,131],"task":[68],"within":[69],"global":[71],"pool.":[74],"propose":[76],"unified":[78,177],"selection":[79],"mechanism":[80],"that":[81,141],"integrates":[82],"attention":[83],"weights":[84],"semantic":[86],"similarity":[87],"globally":[89],"select":[90],"tokens":[91,99,148],"with":[92,136],"contribution":[94],"redundancy.":[97],"Unselected":[98],"are":[100],"merged":[101],"via":[102],"clustering":[103],"refilled,":[105],"preserving":[106],"information":[107],"integrity.":[108],"Inside":[109],"the":[110,183],"LLM,":[111],"we":[112],"further":[113],"introduce":[114],"text-aware":[115],"merging":[116],"perform":[118],"secondary":[119],"based":[121],"query":[123],"relevance.":[124],"Without":[125],"requiring":[126],"retraining,":[127],"our":[128],"method":[129],"serves":[130],"plug-and-play":[133],"module":[134],"compatible":[135],"existing":[137],"Video-LLMs.":[138],"Experiments":[139],"show":[140],"retaining":[142],"only":[143],"about":[144],"2%":[145],"preserves":[149],"90.1%":[150],"baseline":[152],"performance":[153],"across":[154,166],"multiple":[155],"benchmarks,":[156],"while":[157],"reducing":[158],"FLOPs":[159],"roughly":[161],"2.6%.":[162],"These":[163],"benefits":[164],"generalize":[165],"diverse":[167],"backbones,":[168],"decreasing":[169],"end-to-end":[170],"inference":[171],"latency":[172],"memory":[174],"consumption.":[175],"Our":[176],"strategy":[181],"establishes":[182],"state-of-the-art":[184],"video":[186],"understanding":[187],"under":[188],"ultra-low":[189],"retention.":[191]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
