{"id":"https://openalex.org/W4414459425","doi":"https://doi.org/10.1109/wacv61042.2026.00691","title":"EVTP-IVS: Effective Visual Token Pruning For Unifying Instruction Visual Segmentation In Multi-Modal Large Language Models","display_name":"EVTP-IVS: Effective Visual Token Pruning For Unifying Instruction Visual Segmentation In Multi-Modal Large Language Models","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W4414459425","doi":"https://doi.org/10.1109/wacv61042.2026.00691"},"language":"en","primary_location":{"id":"doi:10.1109/wacv61042.2026.00691","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00691","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2508.11886","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100943601","display_name":"Wenhui Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I55732556","display_name":"Arizona State University","ror":"https://ror.org/03efmqc40","country_code":"US","type":"education","lineage":["https://openalex.org/I55732556"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Wenhui Zhu","raw_affiliation_strings":["Arizona State University,AZ,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Arizona State University,AZ,USA","institution_ids":["https://openalex.org/I55732556"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100657345","display_name":"Xiwen Chen","orcid":"https://orcid.org/0000-0003-0170-4459"},"institutions":[{"id":"https://openalex.org/I8078737","display_name":"Clemson University","ror":"https://ror.org/037s24f05","country_code":"US","type":"education","lineage":["https://openalex.org/I8078737"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiwen Chen","raw_affiliation_strings":["Clemson University,SC,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Clemson University,SC,USA","institution_ids":["https://openalex.org/I8078737"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100424170","display_name":"Zhipeng Wang","orcid":"https://orcid.org/0000-0003-0780-0070"},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhipeng Wang","raw_affiliation_strings":["LinkedIn Corporation,CA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LinkedIn Corporation,CA,USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056385133","display_name":"Shao Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shao Tang","raw_affiliation_strings":["LinkedIn Corporation,CA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LinkedIn Corporation,CA,USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100670911","display_name":"Sayan Ghosh","orcid":"https://orcid.org/0000-0001-8758-7657"},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sayan Ghosh","raw_affiliation_strings":["LinkedIn Corporation,CA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"LinkedIn Corporation,CA,USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xuanzhao Dong","orcid":null},"institutions":[{"id":"https://openalex.org/I55732556","display_name":"Arizona State University","ror":"https://ror.org/03efmqc40","country_code":"US","type":"education","lineage":["https://openalex.org/I55732556"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xuanzhao Dong","raw_affiliation_strings":["Arizona State University,AZ,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Arizona State University,AZ,USA","institution_ids":["https://openalex.org/I55732556"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047843370","display_name":"Rajat Koner","orcid":"https://orcid.org/0000-0003-3441-8192"},"institutions":[{"id":"https://openalex.org/I8204097","display_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","ror":"https://ror.org/05591te55","country_code":"DE","type":"education","lineage":["https://openalex.org/I8204097"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Rajat Koner","raw_affiliation_strings":["Ludwig Maximilian University of Munich,Munich,Germany"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ludwig Maximilian University of Munich,Munich,Germany","institution_ids":["https://openalex.org/I8204097"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100740828","display_name":"Yalin Wang","orcid":"https://orcid.org/0000-0002-6241-735X"},"institutions":[{"id":"https://openalex.org/I55732556","display_name":"Arizona State University","ror":"https://ror.org/03efmqc40","country_code":"US","type":"education","lineage":["https://openalex.org/I55732556"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yalin Wang","raw_affiliation_strings":["Arizona State University,AZ,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Arizona State University,AZ,USA","institution_ids":["https://openalex.org/I55732556"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5100943601"],"corresponding_institution_ids":["https://openalex.org/I55732556"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00415619,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"7158","last_page":"7167"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.8669000267982483},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8022000193595886},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6087999939918518},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5360999703407288},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4180000126361847},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.4115000069141388},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3808000087738037}],"concepts":[{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.8669000267982483},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8309999704360962},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8022000193595886},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6535999774932861},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6087999939918518},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5360999703407288},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4180000126361847},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.4115000069141388},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38670000433921814},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3630000054836273},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3407000005245209},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.33719998598098755},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.33079999685287476},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28790000081062317},{"id":"https://openalex.org/C125308379","wikidata":"https://www.wikidata.org/wiki/Q363057","display_name":"Market segmentation","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.27469998598098755},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2614000141620636}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/wacv61042.2026.00691","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00691","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2508.11886","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2508.11886","pdf_url":"https://arxiv.org/pdf/2508.11886","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2508.11886","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2508.11886","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2508.11886","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2508.11886","pdf_url":"https://arxiv.org/pdf/2508.11886","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Instructed":[0],"Visual":[1],"Segmentation":[2],"(IVS)":[3],"tasks":[4,139],"require":[5],"segmenting":[6],"objects":[7],"in":[8,38,46],"images":[9],"or":[10],"videos":[11],"based":[12],"on":[13,28,124,137,142],"natural":[14],"language":[15,21],"instructions.":[16],"While":[17],"recent":[18],"multimodal":[19],"large":[20],"models":[22],"(MLLMs)":[23],"have":[24],"achieved":[25],"strong":[26,51],"performance":[27],"IVS,":[29,97],"their":[30],"inference":[31],"cost":[32],"remains":[33],"a":[34,50,65,74,90],"major":[35],"bottleneck,":[36],"particularly":[37],"video.":[39],"We":[40,113],"empirically":[41],"analyze":[42],"visual":[43,92],"token":[44,55,69,93],"sampling":[45],"MLLMs":[47],"and":[48,57,67,140],"observe":[49],"correlation":[52],"between":[53],"subset":[54,79],"coverage":[56],"segmentation":[58],"performance.":[59],"This":[60],"motivates":[61],"our":[62,121,130],"design":[63],"of":[64,80,152],"simple":[66],"effective":[68],"pruning":[70,94,161,165],"method":[71,95,131,156],"that":[72,129],"selects":[73],"compact":[75],"yet":[76],"spatially":[77],"representative":[78],"tokens":[81],"to":[82,109,119,134],"accelerate":[83],"inference.":[84],"In":[85],"this":[86],"paper,":[87],"we":[88],"introduce":[89],"novel":[91],"for":[96],"called":[98],"EVTP-IV,":[99],"which":[100],"builds":[101],"upon":[102],"the":[103,153],"k-center":[104],"by":[105],"integrating":[106],"spatial":[107],"information":[108],"ensure":[110],"better":[111],"coverage.":[112],"further":[114],"provide":[115],"an":[116],"information-theoretic":[117],"analysis":[118],"support":[120],"design.":[122],"Experiments":[123],"standard":[125],"IVS":[126],"benchmarks":[127],"show":[128],"achieves":[132],"up":[133],"5X":[135],"speed-up":[136],"video":[138],"3.5X":[141],"image":[143],"tasks,":[144],"while":[145],"maintaining":[146],"comparable":[147],"accuracy":[148],"using":[149],"only":[150],"20%":[151],"tokens.":[154],"Our":[155],"also":[157],"consistently":[158],"outperforms":[159],"state-of-the-art":[160],"baselines":[162],"under":[163],"varying":[164],"ratios.":[166]},"counts_by_year":[],"updated_date":"2026-05-07T06:04:25.777469","created_date":"2025-10-10T00:00:00"}
