{"id":"https://openalex.org/W7133352472","doi":"https://doi.org/10.48550/arxiv.2603.01082","title":"Beyond Global Similarity: Towards Fine-Grained, Multi-Condition Multimodal Retrieval","display_name":"Beyond Global Similarity: Towards Fine-Grained, Multi-Condition Multimodal Retrieval","publication_year":2026,"publication_date":"2026-03-01","ids":{"openalex":"https://openalex.org/W7133352472","doi":"https://doi.org/10.48550/arxiv.2603.01082"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01082","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01082","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01082","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052404451","display_name":"Xuan L\u00fc","orcid":"https://orcid.org/0000-0003-4886-0918"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lu, Xuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127910075","display_name":"Kangle Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Kangle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Huang, Haohang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Haohang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127942470","display_name":"Rui Meng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meng, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128018174","display_name":"Wenjun Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Wenjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127877804","display_name":"Xiaoyu Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Xiaoyu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5052404451"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9745000004768372,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9745000004768372,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.006500000134110451,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.0017000000225380063,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7508999705314636},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.6355000138282776},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.6284000277519226},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5479000210762024},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.39559999108314514},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3856000006198883},{"id":"https://openalex.org/keywords/pointwise","display_name":"Pointwise","score":0.34940001368522644},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.3465000092983246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8174999952316284},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7508999705314636},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.6355000138282776},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.6284000277519226},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5479000210762024},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5325999855995178},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.39559999108314514},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3889999985694885},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3856000006198883},{"id":"https://openalex.org/C2777984123","wikidata":"https://www.wikidata.org/wiki/Q9248237","display_name":"Pointwise","level":2,"score":0.34940001368522644},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3465000092983246},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.32190001010894775},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.30079999566078186},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28610000014305115},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2757999897003174},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.2526000142097473},{"id":"https://openalex.org/C185874996","wikidata":"https://www.wikidata.org/wiki/Q269699","display_name":"Interdependence","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01082","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01082","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01082","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01082","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2],"multimodal":[3,14,121,177],"large":[4],"language":[5],"models":[6,105],"(MLLMs)":[7],"have":[8],"substantially":[9],"expanded":[10],"the":[11],"capabilities":[12],"of":[13,119],"retrieval,":[15],"enabling":[16],"systems":[17],"to":[18,64,106,126],"align":[19],"and":[20,25,79,84,101,123,153,172,182,187],"retrieve":[21],"information":[22],"across":[23,48,139],"visual":[24,100,142],"textual":[26,102,148],"modalities.":[27,49],"Yet,":[28],"existing":[29],"benchmarks":[30],"largely":[31],"focus":[32],"on":[33],"coarse-grained":[34],"or":[35],"single-condition":[36],"alignment,":[37],"overlooking":[38],"real-world":[39],"scenarios":[40],"where":[41],"user":[42],"queries":[43],"specify":[44],"multiple":[45],"interdependent":[46],"constraints":[47],"To":[50],"bridge":[51],"this":[52],"gap,":[53],"we":[54],"introduce":[55],"MCMR":[56,73,168],"(Multi-Conditional":[57],"Multimodal":[58],"Retrieval):":[59],"a":[60,116,170],"large-scale":[61],"benchmark":[62,115,174],"designed":[63],"evaluate":[65],"fine-grained,":[66],"multi-condition":[67],"cross-modal":[68],"retrieval":[69,178],"under":[70],"natural-language":[71],"queries.":[72],"spans":[74],"five":[75],"product":[76],"domains:":[77],"upper":[78],"bottom":[80],"clothing,":[81],"jewelry,":[82],"shoes,":[83],"furniture.":[85],"It":[86],"also":[87],"preserves":[88],"rich":[89],"long-form":[90],"metadata":[91,149],"essential":[92],"for":[93,112,175],"compositional":[94],"matching.":[95],"Each":[96],"query":[97],"integrates":[98],"complementary":[99],"attributes,":[103],"requiring":[104],"jointly":[107],"satisfy":[108],"all":[109],"specified":[110],"conditions":[111],"relevance.":[113],"We":[114],"diverse":[117],"suite":[118],"MLLM-based":[120,155],"retrievers":[122],"vision-language":[124],"rerankers":[125,157],"assess":[127],"their":[128],"condition-aware":[129],"reasoning":[130],"abilities.":[131],"Experimental":[132],"results":[133],"reveal:":[134],"(i)":[135],"distinct":[136],"modality":[137],"asymmetries":[138],"models;":[140],"(ii)":[141],"cues":[143],"dominate":[144],"early-rank":[145],"precision,":[146],"while":[147],"stabilizes":[150],"long-tail":[151],"ordering;":[152],"(iii)":[154],"pointwise":[156],"markedly":[158],"improve":[159],"fine-grained":[160],"matching":[161],"by":[162],"explicitly":[163],"verifying":[164],"query-candidate":[165],"consistency.":[166],"Overall,":[167],"establishes":[169],"challenging":[171],"diagnostic":[173],"advancing":[176],"toward":[179],"compositional,":[180],"constraint-aware,":[181],"interpretable":[183],"understanding.":[184],"Our":[185],"code":[186],"dataset":[188],"is":[189],"available":[190],"at":[191],"https://github.com/EIT-NLP/MCMR":[192]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2026-03-04T00:00:00"}
