{"id":"https://openalex.org/W7137963802","doi":"https://doi.org/10.1609/aaai.v40i14.38212","title":"Instruction-Guided Cross-Modal Clustering for Training-Free Visual Token Pruning in Vision-Language Models","display_name":"Instruction-Guided Cross-Modal Clustering for Training-Free Visual Token Pruning in Vision-Language Models","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137963802","doi":"https://doi.org/10.1609/aaai.v40i14.38212"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i14.38212","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38212","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i14.38212","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049189518","display_name":"Yunqian Yu","orcid":"https://orcid.org/0009-0001-2228-6930"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yunqian Yu","raw_affiliation_strings":["School of Information and Software Engineering, University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Information and Software Engineering, University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129658510","display_name":"Biao Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Biao Chen","raw_affiliation_strings":["School of Information and Software Engineering, University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Information and Software Engineering, University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030160680","display_name":"Yong Zhang","orcid":"https://orcid.org/0009-0005-4602-3202"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunya Zhang","raw_affiliation_strings":["School of Information and Software Engineering, University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Information and Software Engineering, University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129704211","display_name":"Tonglan Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tonglan Xie","raw_affiliation_strings":["School of Information and Software Engineering, University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Information and Software Engineering, University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071936558","display_name":"Mengmeng Jing","orcid":"https://orcid.org/0000-0002-0693-2197"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengmeng Jing","raw_affiliation_strings":["School of Information and Software Engineering, University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Information and Software Engineering, University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129706027","display_name":"Lin Zuo","orcid":null},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lin Zuo","raw_affiliation_strings":["School of Information and Software Engineering, University of Electronic Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"School of Information and Software Engineering, University of Electronic Science and Technology of China","institution_ids":["https://openalex.org/I150229711"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5049189518"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.22537313,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"14","first_page":"12213","last_page":"12221"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.003800000064074993,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0017000000225380063,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6735000014305115},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.6721000075340271},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.4948999881744385},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4544999897480011},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4496000111103058},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.42910000681877136},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4203999936580658},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3571000099182129}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.784600019454956},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6735000014305115},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.6721000075340271},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6158000230789185},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.4948999881744385},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.46799999475479126},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4544999897480011},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4496000111103058},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4203999936580658},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.40059998631477356},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C60008888","wikidata":"https://www.wikidata.org/wiki/Q6031013","display_name":"Information bottleneck method","level":3,"score":0.35100001096725464},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3422999978065491},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32109999656677246},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.3059999942779541},{"id":"https://openalex.org/C121934690","wikidata":"https://www.wikidata.org/wiki/Q1084","display_name":"Noun","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C167984511","wikidata":"https://www.wikidata.org/wiki/Q17003931","display_name":"Brown clustering","level":5,"score":0.28439998626708984},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26080000400543213},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25690001249313354},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i14.38212","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38212","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i14.38212","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i14.38212","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7665976881980896,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"vision-language":[1],"models":[2,25],"(LVLMs)":[3],"have":[4],"demonstrated":[5],"remarkable":[6],"capabilities":[7],"in":[8,23,34,66],"understanding":[9],"multimodal":[10,68],"data":[11],"such":[12],"as":[13,186],"images":[14],"and":[15,37,63,78,124,202,239],"text.":[16],"However,":[17],"the":[18,58,67,71,90,95,102,119,129,137,148,184,194,273],"number":[19],"of":[20,30,97,118,139,172,264,272],"visual":[21,64,125,173,196,216,265],"tokens":[22,174,217,266],"these":[24],"often":[26,56],"far":[27],"exceeds":[28],"that":[29,175,244],"textual":[31,99],"tokens,":[32],"resulting":[33],"substantial":[35],"redundancy":[36],"high":[38,86],"inference":[39],"costs.":[40],"Existing":[41],"pruning":[42,87,103,159,227,251],"methods":[43],"primarily":[44],"rely":[45],"on":[46,128,143,232],"either":[47],"unimodal":[48],"information":[49,131],"or":[50],"cross-modal":[51,130,191],"attention":[52,76,166],"mechanisms.":[53],"The":[54],"former":[55],"overlooks":[57],"semantic":[59,200],"alignment":[60],"between":[61,121],"instructions":[62,123],"representations":[65],"space,":[69],"while":[70,205,276],"latter":[72],"is":[73],"prone":[74],"to":[75,81,167,189,224],"drift":[77],"dispersion,":[79],"leading":[80],"significant":[82],"performance":[83,248],"degradation":[84],"under":[85,258],"ratios.":[88],"All":[89],"above":[91],"issues":[92],"stem":[93],"from":[94,183],"lack":[96],"effective":[98,107],"guidance":[100],"during":[101],"process.":[104],"To":[105,198],"identify":[106],"informational":[108],"cues":[109],"for":[110],"guiding":[111],"pruning,":[112],"we":[113,146,209],"conduct":[114],"an":[115,211,259],"in-depth":[116],"analysis":[117],"interaction":[120],"language":[122],"features":[126],"based":[127],"bottleneck":[132],"attribution":[133],"(CIBA)":[134],"method,":[135,155],"revealing":[136],"presence":[138],"noun":[140],"anchors.":[141],"Based":[142],"this":[144],"analysis,":[145],"propose":[147],"Instruction-Guided":[149],"Cross-Modal":[150],"Clustering":[151],"Token":[152],"Pruning":[153],"(ICCTP)":[154],"a":[156,169,225],"plug-and-play,":[157],"training-free":[158],"paradigm.":[160],"Specifically,":[161],"ICCTP":[162,231,245,269],"first":[163],"leverages":[164],"global":[165,177,203],"retain":[168],"small":[170],"set":[171],"preserve":[176],"context.":[178],"It":[179],"then":[180],"extracts":[181],"nouns":[182],"instruction":[185],"clustering":[187,192],"centers":[188],"perform":[190],"over":[193],"remaining":[195],"tokens.":[197],"balance":[199],"diversity":[201],"relevance":[204],"reducing":[206,277],"intra-cluster":[207],"redundancy,":[208],"design":[210],"importance":[212],"scoring":[213],"mechanism.":[214],"Finally,":[215],"within":[218],"each":[219],"cluster":[220],"are":[221,267],"pruned":[222],"according":[223],"specified":[226],"ratio.":[228],"We":[229],"evaluate":[230],"multiple":[233],"VLM":[234],"architectures,":[235],"including":[236],"LLaVA-1.5-7B,":[237],"LLaVA-1.5-13B,":[238],"LLaVA-NeXT-7B.":[240],"Experimental":[241],"results":[242],"show":[243],"maintains":[246],"strong":[247],"across":[249],"various":[250],"rates":[252],"without":[253],"requiring":[254],"retraining.":[255],"Notably,":[256],"even":[257],"extreme":[260],"setting":[261],"where":[262],"94.4%":[263],"removed,":[268],"retains":[270],"90.02%":[271],"original":[274],"accuracy":[275],"TFLOPs":[278],"by":[279],"82.36%.":[280]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
