{"id":"https://openalex.org/W7164831395","doi":"https://doi.org/10.1145/3805622.3810846","title":"GDT-VLM: Global Distribution Modeling for Visual Token Compression in Efficient Multimodal Large Language Models","display_name":"GDT-VLM: Global Distribution Modeling for Visual Token Compression in Efficient Multimodal Large Language Models","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164831395","doi":"https://doi.org/10.1145/3805622.3810846"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810846","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810846","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810846","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052272812","display_name":"Jiangtao Xie","orcid":"https://orcid.org/0000-0002-9714-0215"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiangtao Xie","raw_affiliation_strings":["Dalian University of Technology, Dalian, Liaoning, China"],"raw_orcid":"https://orcid.org/0000-0002-9714-0215","affiliations":[{"raw_affiliation_string":"Dalian University of Technology, Dalian, Liaoning, China","institution_ids":["https://openalex.org/I27357992"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138670108","display_name":"Junjie Wu","orcid":"https://orcid.org/0009-0005-1056-6563"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junjie Wu","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0009-0005-1056-6563","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071501441","display_name":"Z H Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaolin Zhang","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0009-0005-2727-820X","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100658200","display_name":"Qilong Wang","orcid":"https://orcid.org/0000-0002-3765-9787"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qilong Wang","raw_affiliation_strings":["Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-3765-9787","affiliations":[{"raw_affiliation_string":"Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070918242","display_name":"Peihua Li","orcid":"https://orcid.org/0000-0001-7229-3867"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peihua Li","raw_affiliation_strings":["Dalian University of Technology, Dalian, Liaoning, China"],"raw_orcid":"https://orcid.org/0000-0001-7229-3867","affiliations":[{"raw_affiliation_string":"Dalian University of Technology, Dalian, Liaoning, China","institution_ids":["https://openalex.org/I27357992"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93669018,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1711","last_page":"1715"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.39480000734329224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.39480000734329224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.28139999508857727,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.0575999990105629,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.503000020980835},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.4442000091075897},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3165000081062317},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.311599999666214},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.2897999882698059},{"id":"https://openalex.org/keywords/distribution","display_name":"Distribution (mathematics)","score":0.2822999954223633}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7142000198364258},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5232999920845032},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.503000020980835},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.4442000091075897},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36160001158714294},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3165000081062317},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.311599999666214},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3010999858379364},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2962000072002411},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2897999882698059},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2685000002384186},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810846","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810846","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810846","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810846","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W2023205960","https://openalex.org/W2601789736","https://openalex.org/W2962761264","https://openalex.org/W4312935555","https://openalex.org/W4390873312","https://openalex.org/W4390874079","https://openalex.org/W4413157571","https://openalex.org/W7160088235"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"have":[5],"achieved":[6],"remarkable":[7],"success,":[8],"yet":[9,117],"the":[10,42,122],"massive":[11],"number":[12],"of":[13,64,89,125],"visual":[14,31,47,65,69,90,98],"tokens":[15,91],"per":[16],"image":[17],"imposes":[18],"a":[19,29,51,81,114,135,157],"heavy":[20],"inference":[21],"burden.":[22],"Existing":[23],"methods":[24],"attempt":[25],"extreme":[26,68],"compression":[27,71],"with":[28],"single":[30,136],"token":[32,70,137],"via":[33],"spatial":[34],"reduction":[35],"or":[36],"cross-modal":[37],"attention,":[38],"but":[39],"often":[40],"overlook":[41],"statistical":[43,62,141],"information":[44,133],"inherent":[45],"in":[46,72,134],"tokens,":[48,127],"leading":[49],"to":[50],"suboptimal":[52],"efficiency-effectiveness":[53,159],"trade-off.":[54,160],"In":[55],"this":[56,76],"paper,":[57],"we":[58,78],"show":[59,148],"that":[60,84,149],"effective":[61],"characterization":[63],"features":[66,99],"benefits":[67],"efficient":[73,93],"MLLMs.":[74],"To":[75],"end,":[77],"propose":[79],"GDT-VLM,":[80],"novel":[82],"architecture":[83],"exploits":[85],"global":[86,104],"distribution":[87],"modeling":[88,102],"for":[92],"compression.":[94],"Specifically,":[95],"GDT-VLM":[96,129],"encodes":[97],"by":[100],"jointly":[101],"their":[103],"first-order":[105],"(GAP)":[106],"and":[107],"second-order":[108],"(Brownian":[109],"Distance":[110],"Covariance)":[111],"statistics,":[112],"enabling":[113],"more":[115],"expressive":[116],"compact":[118,131],"representation.":[119],"By":[120],"capturing":[121],"holistic":[123],"characteristics":[124],"vision":[126,132],"our":[128,150],"yields":[130],"while":[138,155],"effectively":[139],"preserving":[140],"content.":[142],"Extensive":[143],"experiments":[144],"on":[145],"7":[146],"benchmarks":[147],"approach":[151],"achieves":[152],"competitive":[153],"accuracy":[154],"offering":[156],"favorable":[158]},"counts_by_year":[],"updated_date":"2026-06-17T06:14:20.161405","created_date":"2026-06-16T00:00:00"}
