{"id":"https://openalex.org/W7164814569","doi":"https://doi.org/10.1145/3805622.3810660","title":"Calibrate and Aggregate: Cross-Modal Retrieval with Distribution Alignment and Token Reduction","display_name":"Calibrate and Aggregate: Cross-Modal Retrieval with Distribution Alignment and Token Reduction","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164814569","doi":"https://doi.org/10.1145/3805622.3810660"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810660","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810660","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810660","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101782369","display_name":"Ziyi Wu","orcid":"https://orcid.org/0000-0002-2440-323X"},"institutions":[{"id":"https://openalex.org/I38877650","display_name":"Zhengzhou University","ror":"https://ror.org/04ypx8c21","country_code":"CN","type":"education","lineage":["https://openalex.org/I38877650"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziyi Wu","raw_affiliation_strings":["School of Computer and Artificial Intelligence, Zhengzhou university, Zhengzhou, Henan, China"],"raw_orcid":"https://orcid.org/0000-0002-2440-323X","affiliations":[{"raw_affiliation_string":"School of Computer and Artificial Intelligence, Zhengzhou university, Zhengzhou, Henan, China","institution_ids":["https://openalex.org/I38877650"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138684469","display_name":"Jiahao Li","orcid":"https://orcid.org/0009-0000-6840-9978"},"institutions":[{"id":"https://openalex.org/I38877650","display_name":"Zhengzhou University","ror":"https://ror.org/04ypx8c21","country_code":"CN","type":"education","lineage":["https://openalex.org/I38877650"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiahao Li","raw_affiliation_strings":["School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"],"raw_orcid":"https://orcid.org/0009-0000-6840-9978","affiliations":[{"raw_affiliation_string":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China","institution_ids":["https://openalex.org/I38877650"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017257056","display_name":"Mingyuan Jiu","orcid":"https://orcid.org/0000-0002-4868-0709"},"institutions":[{"id":"https://openalex.org/I38877650","display_name":"Zhengzhou University","ror":"https://ror.org/04ypx8c21","country_code":"CN","type":"education","lineage":["https://openalex.org/I38877650"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingyuan Jiu","raw_affiliation_strings":["School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"],"raw_orcid":"https://orcid.org/0000-0002-4868-0709","affiliations":[{"raw_affiliation_string":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China","institution_ids":["https://openalex.org/I38877650"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035853607","display_name":"Hongru Zhao","orcid":"https://orcid.org/0000-0003-1437-9456"},"institutions":[{"id":"https://openalex.org/I38877650","display_name":"Zhengzhou University","ror":"https://ror.org/04ypx8c21","country_code":"CN","type":"education","lineage":["https://openalex.org/I38877650"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongru Zhao","raw_affiliation_strings":["School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"],"raw_orcid":"https://orcid.org/0000-0003-1437-9456","affiliations":[{"raw_affiliation_string":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China","institution_ids":["https://openalex.org/I38877650"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029896607","display_name":"Hichem Sahbi","orcid":"https://orcid.org/0000-0001-6813-9146"},"institutions":[{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"government","lineage":["https://openalex.org/I1294671590"]},{"id":"https://openalex.org/I39804081","display_name":"Sorbonne Universit\u00e9","ror":"https://ror.org/02en5vm52","country_code":"FR","type":"education","lineage":["https://openalex.org/I39804081"]},{"id":"https://openalex.org/I51101395","display_name":"Universit\u00e9 Paris 1 Panth\u00e9on-Sorbonne","ror":"https://ror.org/002t25c44","country_code":"FR","type":"education","lineage":["https://openalex.org/I51101395"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Hichem Sahbi","raw_affiliation_strings":["CNRS LIP6, Sorbonne University, Paris, France"],"raw_orcid":"https://orcid.org/0000-0001-6813-9146","affiliations":[{"raw_affiliation_string":"CNRS LIP6, Sorbonne University, Paris, France","institution_ids":["https://openalex.org/I39804081","https://openalex.org/I51101395","https://openalex.org/I1294671590"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081346568","display_name":"Mingliang Xu","orcid":"https://orcid.org/0000-0002-6885-3451"},"institutions":[{"id":"https://openalex.org/I38877650","display_name":"Zhengzhou University","ror":"https://ror.org/04ypx8c21","country_code":"CN","type":"education","lineage":["https://openalex.org/I38877650"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingliang Xu","raw_affiliation_strings":["School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China"],"raw_orcid":"https://orcid.org/0000-0002-6885-3451","affiliations":[{"raw_affiliation_string":"School of Computer and Artificial Intelligence, Zhengzhou University, Zhengzhou, Henan, China","institution_ids":["https://openalex.org/I38877650"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93459058,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"356","last_page":"365"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9174000024795532,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9174000024795532,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.06759999692440033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.0034000000450760126,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.510699987411499},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.45489999651908875},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.44040000438690186},{"id":"https://openalex.org/keywords/clutter","display_name":"Clutter","score":0.41130000352859497},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4043000042438507},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.38909998536109924},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.36890000104904175},{"id":"https://openalex.org/keywords/intersection","display_name":"Intersection (aeronautics)","score":0.36800000071525574},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.36480000615119934}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.77920001745224},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6064000129699707},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.510699987411499},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.45489999651908875},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.44040000438690186},{"id":"https://openalex.org/C132094186","wikidata":"https://www.wikidata.org/wiki/Q641585","display_name":"Clutter","level":3,"score":0.41130000352859497},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4043000042438507},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.36890000104904175},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.36800000071525574},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.36480000615119934},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.35850000381469727},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.3513000011444092},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.3172000050544739},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.31150001287460327},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.3093999922275543},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3091999888420105},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.2939000129699707},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.28279998898506165},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2768999934196472},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.27469998598098755},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.25999999046325684},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.259799987077713}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810660","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810660","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810660","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810660","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5924907192","display_name":null,"funder_award_id":"62272422, U22B2051, 62325602","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2745461083","https://openalex.org/W2956018683","https://openalex.org/W2962964995","https://openalex.org/W2963467339","https://openalex.org/W2988823324","https://openalex.org/W3035552787","https://openalex.org/W3081484346","https://openalex.org/W3090449556","https://openalex.org/W3118694826","https://openalex.org/W3135361293","https://openalex.org/W3168433561","https://openalex.org/W3176451698","https://openalex.org/W4283812943","https://openalex.org/W4311415873","https://openalex.org/W4312980231","https://openalex.org/W4376312115","https://openalex.org/W4386066600","https://openalex.org/W4386075677","https://openalex.org/W4390790453","https://openalex.org/W4402660140","https://openalex.org/W4402727885","https://openalex.org/W4403707460","https://openalex.org/W4405740219","https://openalex.org/W4406857712","https://openalex.org/W4406890887","https://openalex.org/W4409959339","https://openalex.org/W7081963526"],"related_works":[],"abstract_inverted_index":{"Cross-modal":[0],"image-text":[1],"retrieval":[2,56,152,158],"remains":[3],"a":[4,67,80,89,109],"fundamental":[5],"challenge":[6],"at":[7,169],"the":[8,34,103],"intersection":[9],"of":[10],"vision":[11],"and":[12,24,42,60,93,134,144,156],"language.":[13],"While":[14],"existing":[15],"methods":[16],"leveraging":[17],"large-scale":[18],"pre-trained":[19],"models":[20],"such":[21],"as":[22],"CLIP":[23],"BERT":[25],"have":[26],"achieved":[27],"notable":[28],"progress,":[29],"they":[30],"often":[31],"struggle":[32],"with":[33,150],"inherent":[35],"modality":[36],"gap,":[37],"background":[38],"clutter":[39],"in":[40],"images,":[41],"computational":[43,128],"inefficiency.":[44],"In":[45],"this":[46],"paper,":[47],"we":[48,107],"propose":[49],"an":[50],"end-to-end":[51],"framework":[52],"for":[53,137],"efficient":[54],"cross-modal":[55],"using":[57],"Distribution":[58],"Alignment":[59],"Token":[61],"Reduction":[62],"(DATR).":[63],"The":[64,164],"model":[65],"employs":[66],"feature":[68],"extraction":[69],"network":[70],"to":[71,117],"extract":[72],"multi-level":[73],"representations,":[74],"which":[75],"are":[76,167],"preliminarily":[77],"aligned":[78],"via":[79],"distribution":[81],"calibration":[82],"module.":[83],"This":[84],"module":[85,113],"maps":[86],"features":[87],"into":[88,123],"Gaussian":[90],"latent":[91],"space":[92],"maximizes":[94],"inter-modal":[95],"mutual":[96],"information":[97],"through":[98],"InfoNCE":[99],"loss,":[100],"effectively":[101],"bridging":[102],"semantic":[104,124],"gap.":[105],"Furthermore,":[106],"incorporate":[108],"differentiable":[110],"token":[111],"reduction":[112],"inspired":[114],"by":[115],"GroupViT":[116],"dynamically":[118],"cluster":[119],"redundant":[120],"visual":[121],"tokens":[122],"groups,":[125],"significantly":[126],"reducing":[127],"overhead.":[129],"Similarity":[130],"computation":[131],"integrates":[132],"global":[133],"local":[135],"alignments":[136],"robust":[138],"matching.":[139],"Extensive":[140],"experiments":[141],"on":[142,162],"Flickr30K":[143],"MSCOCO":[145],"datasets":[146],"demonstrate":[147],"state-of-the-art":[148],"performance,":[149],"image-to-text":[151],"R@1":[153,159],"reaching":[154],"89.6%":[155],"text-to-image":[157],"achieving":[160],"77.2%":[161],"Flickr30K.":[163],"source":[165],"codes":[166],"available":[168],"https://github.com/Wuziyi123/DATR.":[170]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
