{"id":"https://openalex.org/W7154400690","doi":"https://doi.org/10.48550/arxiv.2604.11240","title":"Decoupled Similarity for Task-Aware Token Pruning in Large Vision-Language Models","display_name":"Decoupled Similarity for Task-Aware Token Pruning in Large Vision-Language Models","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154400690","doi":"https://doi.org/10.48550/arxiv.2604.11240"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11240","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11240","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11240","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125404097","display_name":"Kexin Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Kexin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133591554","display_name":"Jing Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133568403","display_name":"Chaofeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Chaofeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133587410","display_name":"Geyong Min","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Min, Geyong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029271867","display_name":"Guibo Zhu","orcid":"https://orcid.org/0000-0001-8293-3952"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Guibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133582258","display_name":"Jinqiao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jinqiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133619771","display_name":"Liang Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Liang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8851000070571899,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8851000070571899,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.037300001829862595,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.016100000590085983,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.8822000026702881},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6687999963760376},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5764999985694885},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.5486999750137329},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4602999985218048},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.4440999925136566},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4212000072002411}],"concepts":[{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.8822000026702881},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7641000151634216},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6687999963760376},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5947999954223633},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5764999985694885},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.5486999750137329},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4602999985218048},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.4440999925136566},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.424699991941452},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4212000072002411},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.2574999928474426},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11240","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11240","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11240","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11240","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Token":[0],"pruning":[1,46,70,113,125,129],"has":[2],"emerged":[3],"as":[4],"an":[5],"effective":[6],"approach":[7],"to":[8,49,81],"reduce":[9],"the":[10,72,115,176],"substantial":[11],"computational":[12],"overhead":[13],"of":[14,117,169,175],"Large":[15],"Vision-Language":[16],"Models":[17],"(LVLMs)":[18],"by":[19,165],"discarding":[20],"less":[21],"informative":[22],"visual":[23,73,87,103,108,121,170],"tokens":[24],"while":[25,172],"preserving":[26],"performance.":[27,178],"However,":[28],"existing":[29],"methods":[30,144],"typically":[31],"rely":[32],"on":[33],"individual":[34],"attention":[35,51],"sources":[36],"from":[37,107],"different":[38],"LVLM":[39],"components,":[40],"resulting":[41],"in":[42,145],"incomplete":[43],"and":[44,89,120,136,148,159],"suboptimal":[45],"decisions":[47],"due":[48],"biased":[50],"distributions.":[52],"To":[53],"address":[54],"this":[55],"problem,":[56],"we":[57],"propose":[58],"DeSAP,":[59],"a":[60,78,154,160],"novel":[61],"Decoupled":[62],"Similarity-Aware":[63],"Pruning":[64],"method":[65],"for":[66,96],"precise,":[67],"task-aware":[68],"token":[69,112],"within":[71],"encoder.":[74],"Specifically,":[75],"DeSAP":[76,110,140,152],"introduces":[77],"decoupled":[79,100],"similarity":[80,101],"capture":[82],"fine-grained":[83],"cross-modal":[84],"relevance":[85],"between":[86],"features":[88],"text":[90],"tokens,":[91,171],"providing":[92],"explicit":[93],"task-related":[94,119],"guidance":[95,116],"pruning.":[97],"By":[98],"integrating":[99],"with":[102],"saliency":[104],"signals":[105],"derived":[106],"attention,":[109],"performs":[111],"under":[114,127],"both":[118,146],"cues,":[122],"enabling":[123],"robust":[124],"even":[126],"aggressive":[128],"ratios.":[130],"Extensive":[131],"experiments":[132],"across":[133],"diverse":[134],"benchmarks":[135],"architectures":[137],"show":[138],"that":[139],"consistently":[141],"outperforms":[142],"SOTA":[143],"accuracy":[147],"efficiency.":[149],"On":[150],"LLaVA-1.5-7B,":[151],"achieves":[153],"10":[155],"times":[156,162],"FLOPs":[157],"reduction":[158],"2.3":[161],"prefill":[163],"speedup":[164],"retaining":[166],"only":[167],"11.1%":[168],"maintaining":[173],"98.1%":[174],"original":[177]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
