{"id":"https://openalex.org/W7140231574","doi":"https://doi.org/10.48550/arxiv.2603.21232","title":"QMoP: Query Guided Mixture-of-Projector for Efficient Visual Token Compression","display_name":"QMoP: Query Guided Mixture-of-Projector for Efficient Visual Token Compression","publication_year":2026,"publication_date":"2026-03-22","ids":{"openalex":"https://openalex.org/W7140231574","doi":"https://doi.org/10.48550/arxiv.2603.21232"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21232","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21232","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21232","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Li, Zhongyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Zhongyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Yaqian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yaqian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Fang, Faming","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Faming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Takezoe, Rinyoichi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takezoe, Rinyoichi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bo, Zi-Hao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bo, Zi-Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Qian, Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Guang, Mo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guang, Mo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Guixu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Guixu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Long, Kaiwen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Kaiwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7678999900817871,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7678999900817871,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.12129999697208405,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.02319999970495701,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6524999737739563},{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.4945000112056732},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.46889999508857727},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.42719998955726624},{"id":"https://openalex.org/keywords/liveness","display_name":"Liveness","score":0.3950999975204468},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.36090001463890076},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.36079999804496765},{"id":"https://openalex.org/keywords/bespoke","display_name":"Bespoke","score":0.35910001397132874}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8651999831199646},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6524999737739563},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.4945000112056732},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46950000524520874},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.46889999508857727},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.42719998955726624},{"id":"https://openalex.org/C15569618","wikidata":"https://www.wikidata.org/wiki/Q3561421","display_name":"Liveness","level":2,"score":0.3950999975204468},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.36090001463890076},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.36079999804496765},{"id":"https://openalex.org/C44210515","wikidata":"https://www.wikidata.org/wiki/Q16968978","display_name":"Bespoke","level":2,"score":0.35910001397132874},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3391999900341034},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.30239999294281006},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29179999232292175},{"id":"https://openalex.org/C24028149","wikidata":"https://www.wikidata.org/wiki/Q7094056","display_name":"Online aggregation","level":5,"score":0.2913999855518341},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.2782999873161316},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C178278151","wikidata":"https://www.wikidata.org/wiki/Q7936607","display_name":"Visual memory","level":3,"score":0.26829999685287476},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2623000144958496},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26030001044273376},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C189645446","wikidata":"https://www.wikidata.org/wiki/Q350865","display_name":"Mirroring","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21232","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21232","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21232","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21232","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"large":[1],"language":[2],"models":[3],"suffer":[4],"from":[5,124],"severe":[6],"computational":[7],"and":[8,31,62,91,120,132,194,201],"memory":[9],"bottlenecks,":[10],"as":[11],"the":[12,112,122,143,146,157,172],"number":[13],"of":[14,20,148,159],"visual":[15,33,68,103,130,177],"tokens":[16,34,69],"far":[17],"exceeds":[18],"that":[19,44,65,183],"textual":[21,133],"tokens.":[22],"While":[23],"recent":[24],"methods":[25],"employ":[26],"projector":[27],"modules":[28],"to":[29,100,141],"align":[30],"compress":[32],"into":[35],"text-aligned":[36],"features,":[37],"they":[38],"typically":[39],"depend":[40],"on":[41,128,186],"fixed":[42],"heuristics":[43],"limit":[45],"adaptability":[46],"across":[47],"diverse":[48],"scenarios.":[49],"In":[50],"this":[51],"paper,":[52],"we":[53,110,163],"first":[54],"propose":[55],"Query":[56,113],"Guided":[57,114],"Mixture-of-Projector":[58],"(QMoP),":[59],"a":[60,75,83,93,167],"novel":[61],"flexible":[63],"framework":[64],"adaptively":[66,106],"compresses":[67],"via":[70],"three":[71],"collaborative":[72],"branches:":[73],"(1)":[74],"pooling-based":[76],"branch":[77,85,95],"for":[78,86,96,170],"coarse-grained":[79],"global":[80],"semantics,":[81],"(2)":[82],"resampler":[84],"extracting":[87],"high-level":[88],"semantic":[89],"representations,":[90],"(3)":[92],"pruning-based":[94],"fine-grained":[97],"token":[98,178],"selection":[99],"preserve":[101],"critical":[102],"detail.":[104],"To":[105,154],"coordinate":[107],"these":[108],"branches,":[109],"introduce":[111],"Router":[115],"(QGR),":[116],"which":[117],"dynamically":[118],"selects":[119],"weights":[121],"outputs":[123],"different":[125],"branches":[126],"based":[127],"both":[129],"input":[131],"queries.":[134],"A":[135],"Mixture-of-Experts-style":[136],"fusion":[137],"mechanism":[138],"is":[139],"designed":[140],"aggregate":[142],"outputs,":[144],"harnessing":[145],"strengths":[147],"each":[149],"strategy":[150],"while":[151],"suppressing":[152],"noise.":[153],"systematically":[155],"evaluate":[156],"effects":[158],"Visual":[160],"Token":[161],"Compression,":[162],"also":[164],"develop":[165],"VTCBench,":[166],"dedicated":[168],"benchmark":[169],"evaluating":[171],"information":[173],"loss":[174],"induced":[175],"by":[176],"compression.":[179],"Extensive":[180],"experiments":[181],"demonstrate":[182],"despite":[184],"relying":[185],"fundamental":[187],"compression":[188],"modules,":[189],"QMoP":[190],"outperforms":[191],"strong":[192],"baselines":[193],"delivers":[195],"significant":[196],"savings":[197],"in":[198],"memory,":[199],"computation,":[200],"inference":[202],"time.":[203]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
