{"id":"https://openalex.org/W4416288898","doi":"https://doi.org/10.1109/iccv51701.2025.02224","title":"Multi-Granular Spatio-Temporal Token Merging for Training-Free Acceleration of Video LLMs","display_name":"Multi-Granular Spatio-Temporal Token Merging for Training-Free Acceleration of Video LLMs","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416288898","doi":"https://doi.org/10.1109/iccv51701.2025.02224"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.02224","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.02224","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2507.07990","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045256780","display_name":"Jeongseok Hyun","orcid":"https://orcid.org/0000-0002-8629-3929"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jeongseok Hyun","raw_affiliation_strings":["Yonsei University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010331532","display_name":"Sukjun Hwang","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sukjun Hwang","raw_affiliation_strings":["Carnegie Mellon University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100841867","display_name":"Su Ho Han","orcid":null},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Su Ho Han","raw_affiliation_strings":["Yonsei University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Taeoh Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Taeoh Kim","raw_affiliation_strings":["NAVER Cloud"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NAVER Cloud","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100349171","display_name":"Inwoong Lee","orcid":"https://orcid.org/0000-0003-4356-7616"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Inwoong Lee","raw_affiliation_strings":["NAVER Cloud"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NAVER Cloud","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028451951","display_name":"Dongyoon Wee","orcid":"https://orcid.org/0000-0003-0359-146X"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Dongyoon Wee","raw_affiliation_strings":["NAVER Cloud"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NAVER Cloud","institution_ids":["https://openalex.org/I60922564"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044267273","display_name":"Joon\u2010Young Lee","orcid":"https://orcid.org/0000-0003-4822-855X"},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joon-Young Lee","raw_affiliation_strings":["Adobe Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Adobe Research","institution_ids":["https://openalex.org/I1306409833"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103036411","display_name":"Seon Joo Kim","orcid":"https://orcid.org/0000-0001-8512-216X"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Seon Joo Kim","raw_affiliation_strings":["Yonsei University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5071547535","display_name":"Minho Shim","orcid":"https://orcid.org/0000-0002-9637-4909"},"institutions":[{"id":"https://openalex.org/I60922564","display_name":"Naver (South Korea)","ror":"https://ror.org/04nzrnx83","country_code":"KR","type":"company","lineage":["https://openalex.org/I60922564"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Minho Shim","raw_affiliation_strings":["NAVER Cloud"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NAVER Cloud","institution_ids":["https://openalex.org/I60922564"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30622279,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"23990","last_page":"24000"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6119999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6119999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.15039999783039093,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.048500001430511475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7871999740600586},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6274999976158142},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.5037000179290771},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.48420000076293945},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.3986000120639801},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.37119999527931213},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.3589000105857849}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8457000255584717},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7871999740600586},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6274999976158142},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.5037000179290771},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.48420000076293945},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.40209999680519104},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.3986000120639801},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.37119999527931213},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.3589000105857849},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3440999984741211},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.33160001039505005},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31869998574256897},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.30469998717308044},{"id":"https://openalex.org/C128840427","wikidata":"https://www.wikidata.org/wiki/Q1302174","display_name":"Motion compensation","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C27602214","wikidata":"https://www.wikidata.org/wiki/Q1868547","display_name":"Locality of reference","level":3,"score":0.2912999987602234},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2696000039577484},{"id":"https://openalex.org/C151416825","wikidata":"https://www.wikidata.org/wiki/Q934791","display_name":"Quadtree","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C117090137","wikidata":"https://www.wikidata.org/wiki/Q7927977","display_name":"Video post-processing","level":5,"score":0.25119999051094055},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.25}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.02224","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.02224","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2507.07990","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.07990","pdf_url":"https://arxiv.org/pdf/2507.07990","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2507.07990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.07990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2507.07990","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.07990","pdf_url":"https://arxiv.org/pdf/2507.07990","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5147037385","display_name":null,"funder_award_id":"RS-2025-00554790","funder_id":"https://openalex.org/F4320328359","funder_display_name":"Ministry of Science and ICT, South Korea"},{"id":"https://openalex.org/G7200128492","display_name":null,"funder_award_id":"RS-2020-11201361","funder_id":"https://openalex.org/F4320321314","funder_display_name":"Yonsei University"}],"funders":[{"id":"https://openalex.org/F4320321314","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96"},{"id":"https://openalex.org/F4320322120","display_name":"National Research Foundation of Korea","ror":"https://ror.org/013aysd81"},{"id":"https://openalex.org/F4320328359","display_name":"Ministry of Science and ICT, South Korea","ror":"https://ror.org/01wpjm123"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"large":[1,12],"language":[2],"models":[3],"(LLMs)":[4],"achieve":[5],"strong":[6],"video":[7,51,97],"understanding":[8],"by":[9],"leveraging":[10],"a":[11,31,70,74,103,108,113,118,123,127],"number":[13],"of":[14],"spatio-temporal":[15,33],"tokens,":[16],"but":[17],"suffer":[18],"from":[19],"quadratic":[20],"computational":[21],"scaling":[22],"with":[23,106,121],"token":[24,34,92,115],"count.":[25],"To":[26],"address":[27],"this,":[28],"we":[29],"propose":[30],"training-free":[32],"merging":[35,81,88],"method,":[36],"named":[37],"STTM.":[38],"Our":[39],"key":[40],"insight":[41],"is":[42,132,148],"to":[43],"exploit":[44],"local":[45],"spatial":[46,67],"and":[47,117],"temporal":[48,84],"redundancy":[49],"in":[50,57],"data":[52],"which":[53],"has":[54],"been":[55],"overlooked":[56],"prior":[58],"work.":[59],"STTM":[60,101,131],"first":[61],"transforms":[62],"each":[63],"frame":[64],"into":[65],"multi-granular":[66],"tokens":[68],"using":[69],"coarse-to-fine":[71],"search":[72],"over":[73],"quadtree":[75],"structure,":[76],"then":[77],"performs":[78],"directed":[79],"pairwise":[80],"across":[82,95,138],"the":[83,142],"dimension.":[85],"This":[86],"decomposed":[87],"approach":[89],"outperforms":[90],"existing":[91],"reduction":[93],"methods":[94],"six":[96],"QA":[98],"benchmarks.":[99],"Notably,":[100],"achieves":[102],"2$\\times$":[104],"speed-up":[105,120],"only":[107],"0.5%":[109],"accuracy":[110],"drop":[111,125],"under":[112,126],"50%":[114],"budget,":[116],"3$\\times$":[119],"just":[122],"2%":[124],"30%":[128],"budget.":[129],"Moreover,":[130],"query-agnostic,":[133],"allowing":[134],"KV":[135],"cache":[136],"reuse":[137],"different":[139],"questions":[140],"for":[141],"same":[143],"video.":[144],"The":[145],"project":[146],"page":[147],"available":[149],"at":[150],"https://www.jshyun.me/projects/sttm.":[151]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
