{"id":"https://openalex.org/W7161003903","doi":"https://doi.org/10.48550/arxiv.2605.11803","title":"OTT-Vid: Optimal Transport Temporal Token Compression for Video Large Language Models","display_name":"OTT-Vid: Optimal Transport Temporal Token Compression for Video Large Language Models","publication_year":2026,"publication_date":"2026-05-12","ids":{"openalex":"https://openalex.org/W7161003903","doi":"https://doi.org/10.48550/arxiv.2605.11803"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11803","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11803","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11803","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136034319","display_name":"Minseok Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Minseok","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136068465","display_name":"Minhyeok Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Minhyeok","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136048625","display_name":"Jungho Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Jungho","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136056628","display_name":"Minjung Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Minjung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136068107","display_name":"Donghyeong Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Donghyeong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103095190","display_name":"Dayeon Lee","orcid":"https://orcid.org/0000-0003-2806-9993"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Dayeon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023109720","display_name":"Heeseung Choi","orcid":"https://orcid.org/0000-0003-3223-1885"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choi, Heeseung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014088881","display_name":"Ig-Jae Kim","orcid":"https://orcid.org/0000-0002-2741-7047"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Ig-jae","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136058056","display_name":"Sangyoun Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Sangyoun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7343000173568726,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7343000173568726,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.08399999886751175,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.04490000009536743,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8395000100135803},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5644999742507935},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5353999733924866},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.5127000212669373},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.4790000021457672},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.427700012922287},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.41940000653266907},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3928000032901764},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.38769999146461487}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8395000100135803},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7906000018119812},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5644999742507935},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5353999733924866},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.5127000212669373},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.4790000021457672},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4456999897956848},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.427700012922287},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.41940000653266907},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3928000032901764},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.38769999146461487},{"id":"https://openalex.org/C20556612","wikidata":"https://www.wikidata.org/wiki/Q4469374","display_name":"Volume (thermodynamics)","level":2,"score":0.3799999952316284},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3765999972820282},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.37130001187324524},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.34880000352859497},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.3483999967575073},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.34310001134872437},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.32120001316070557},{"id":"https://openalex.org/C118930307","wikidata":"https://www.wikidata.org/wiki/Q600590","display_name":"Tuple","level":2,"score":0.3027999997138977},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2989000082015991},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C39394851","wikidata":"https://www.wikidata.org/wiki/Q921594","display_name":"Inter frame","level":4,"score":0.2937000095844269},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.27410000562667847},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C57654395","wikidata":"https://www.wikidata.org/wiki/Q1097775","display_name":"Compression artifact","level":5,"score":0.2653999924659729},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2651999890804291},{"id":"https://openalex.org/C142575187","wikidata":"https://www.wikidata.org/wiki/Q3358290","display_name":"Pyramid (geometry)","level":2,"score":0.26460000872612},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2590999901294708},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11803","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11803","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11803","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11803","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Video":[1],"Large":[2],"Language":[3],"Models":[4],"(Video-LLMs)":[5],"scale":[6],"to":[7,19,38,65,69,114,171],"longer":[8],"and":[9,63,104,134,142,153,184,194],"more":[10],"complex":[11],"videos,":[12],"their":[13],"inference":[14],"cost":[15,137,159],"grows":[16],"rapidly":[17],"due":[18],"the":[20,70,161],"large":[21],"volume":[22],"of":[23,72,93,164,192,196,203],"visual":[24],"tokens":[25,130],"accumulated":[26],"across":[27],"frames.":[28],"Training-free":[29],"token":[30,50,88,124,151],"compression":[31,44,67,173,210],"has":[32],"emerged":[33],"as":[34],"a":[35,82,135],"practical":[36],"solution":[37],"this":[39,77,120],"bottleneck.":[40],"However,":[41],"existing":[42,207],"temporal":[43,87,116,185],"methods":[45],"rely":[46],"primarily":[47],"on":[48,177],"cross-frame":[49],"similarity":[51],"or":[52],"segmentation":[53],"heuristics,":[54],"overlooking":[55],"each":[56,73,102,165],"token's":[57],"semantic":[58],"role":[59],"within":[60,101],"its":[61,157],"frame":[62,74,166],"failing":[64],"adapt":[66],"strength":[68],"compressibility":[71],"pair.":[75],"In":[76],"work,":[78],"we":[79,169],"propose":[80],"OTT-Vid,":[81],"transport-derived":[83],"allocation":[84],"framework":[85],"for":[86],"compression.":[89],"Our":[90],"approach":[91],"consists":[92],"two":[94],"stages:":[95],"spatial":[96,143],"pruning":[97],"identifies":[98],"representative":[99],"content":[100],"frame,":[103],"optimal":[105],"transport":[106,147,162],"(OT)":[107],"is":[108],"then":[109],"solved":[110],"between":[111],"neighboring":[112],"frames":[113],"estimate":[115],"compressibility.":[117],"We":[118],"formulate":[119],"OT":[121],"with":[122],"non-uniform":[123],"mass,":[125],"which":[126,168],"protects":[127],"semantically":[128],"important":[129],"from":[131],"aggressive":[132],"compression,":[133],"locality-aware":[136],"that":[138,188],"captures":[139],"both":[140],"feature":[141],"disparities.":[144],"The":[145],"resulting":[146],"plan":[148],"jointly":[149],"balances":[150],"importance":[152],"matching":[154],"cost,":[155],"while":[156,199],"total":[158],"defines":[160],"difficulty":[163],"pair,":[167],"use":[170],"allocate":[172],"budgets":[174],"dynamically.":[175],"Experiments":[176],"six":[178],"benchmarks":[179],"spanning":[180],"video":[181],"question":[182],"answering":[183],"grounding":[186],"show":[187],"OTT-Vid":[189],"preserves":[190],"95.8%":[191],"VQA":[193],"73.9%":[195],"VTG":[197],"performance":[198],"retaining":[200],"only":[201],"10%":[202],"tokens,":[204],"consistently":[205],"outperforming":[206],"state-of-the-art":[208],"training-free":[209],"methods.":[211]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
