{"id":"https://openalex.org/W7164811900","doi":"https://doi.org/10.1145/3805622.3810824","title":"M-STAR: Multi-view Semantic Topology Alignment with Reasoning from VLMs for Image-Text Retrieval","display_name":"M-STAR: Multi-view Semantic Topology Alignment with Reasoning from VLMs for Image-Text Retrieval","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164811900","doi":"https://doi.org/10.1145/3805622.3810824"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810824","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810824","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810824","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5138641693","display_name":"Xuewen He","orcid":"https://orcid.org/0009-0000-9517-7826"},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuewen He","raw_affiliation_strings":["Chongqing Normal University, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0000-9517-7826","affiliations":[{"raw_affiliation_string":"Chongqing Normal University, Chongqing, China","institution_ids":["https://openalex.org/I126924076"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072410355","display_name":"Ya Guo","orcid":"https://orcid.org/0000-0002-8016-988X"},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuning Guo","raw_affiliation_strings":["Chongqing Normal University, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0008-8078-1754","affiliations":[{"raw_affiliation_string":"Chongqing Normal University, Chongqing, China","institution_ids":["https://openalex.org/I126924076"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128163061","display_name":"Fumao Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fumao Xu","raw_affiliation_strings":["Chongqing Normal University, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0008-2081-4315","affiliations":[{"raw_affiliation_string":"Chongqing Normal University, Chongqing, China","institution_ids":["https://openalex.org/I126924076"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100686151","display_name":"Mingyong Li","orcid":"https://orcid.org/0000-0002-5517-3633"},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingyong Li","raw_affiliation_strings":["Chongqing Normal University, Chongqing, China"],"raw_orcid":"https://orcid.org/0000-0002-5517-3633","affiliations":[{"raw_affiliation_string":"Chongqing Normal University, Chongqing, China","institution_ids":["https://openalex.org/I126924076"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93430685,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"118","last_page":"127"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9101999998092651,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9101999998092651,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.052000001072883606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.009399999864399433,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6071000099182129},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5598000288009644},{"id":"https://openalex.org/keywords/semantic-mapping","display_name":"Semantic mapping","score":0.4584999978542328},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.45249998569488525},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4440999925136566},{"id":"https://openalex.org/keywords/subspace-topology","display_name":"Subspace topology","score":0.4318000078201294},{"id":"https://openalex.org/keywords/semantic-integration","display_name":"Semantic integration","score":0.412200003862381},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.40939998626708984},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.38019999861717224}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7688000202178955},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6071000099182129},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5598000288009644},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5396999716758728},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.4584999978542328},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4440999925136566},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.4318000078201294},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4174000024795532},{"id":"https://openalex.org/C110903229","wikidata":"https://www.wikidata.org/wiki/Q7449064","display_name":"Semantic integration","level":4,"score":0.412200003862381},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.40939998626708984},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.38019999861717224},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.35749998688697815},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3319999873638153},{"id":"https://openalex.org/C2778493491","wikidata":"https://www.wikidata.org/wiki/Q7449072","display_name":"Semantic matching","level":3,"score":0.3199000060558319},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.3172999918460846},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2799000144004822},{"id":"https://openalex.org/C12362212","wikidata":"https://www.wikidata.org/wiki/Q728435","display_name":"Linear subspace","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C202708506","wikidata":"https://www.wikidata.org/wiki/Q7449050","display_name":"Semantic compression","level":5,"score":0.2705000042915344},{"id":"https://openalex.org/C197914299","wikidata":"https://www.wikidata.org/wiki/Q18650","display_name":"Semantic memory","level":3,"score":0.26190000772476196},{"id":"https://openalex.org/C37926939","wikidata":"https://www.wikidata.org/wiki/Q7449061","display_name":"Semantic equivalence","level":4,"score":0.25760000944137573},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.25690001249313354},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810824","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810824","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810824","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810824","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.7290802597999573,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1905882502","https://openalex.org/W2185175083","https://openalex.org/W2948077755","https://openalex.org/W2956018683","https://openalex.org/W2962964995","https://openalex.org/W2964727037","https://openalex.org/W2985076077","https://openalex.org/W2988823324","https://openalex.org/W3010277541","https://openalex.org/W3035454331","https://openalex.org/W3035524453","https://openalex.org/W3092820619","https://openalex.org/W3110042533","https://openalex.org/W3118694826","https://openalex.org/W3138516171","https://openalex.org/W3155230099","https://openalex.org/W3168433561","https://openalex.org/W3175616662","https://openalex.org/W3175888430","https://openalex.org/W3203447128","https://openalex.org/W4210894218","https://openalex.org/W4226538672","https://openalex.org/W4312761738","https://openalex.org/W4323338501","https://openalex.org/W4386065291","https://openalex.org/W4386071498","https://openalex.org/W4386071700","https://openalex.org/W4386071757","https://openalex.org/W4390603585","https://openalex.org/W4391216079","https://openalex.org/W4392172944","https://openalex.org/W4402727895","https://openalex.org/W4406890887","https://openalex.org/W4408056946","https://openalex.org/W4408803975","https://openalex.org/W4409364271","https://openalex.org/W4411635487","https://openalex.org/W4415708226","https://openalex.org/W7133182869","https://openalex.org/W7133200878","https://openalex.org/W7160024138"],"related_works":[],"abstract_inverted_index":{"Cross-modal":[0],"image\u2013text":[1],"retrieval":[2],"serves":[3],"as":[4],"a":[5,107,122,151],"fundamental":[6],"bridge":[7],"between":[8],"visual":[9,20,26,133,160],"perception":[10],"and":[11,40,78,92,157,182,192,201],"linguistic":[12],"cognition.":[13],"However,":[14],"the":[15,113,129,138,155,163,170,190],"inherent":[16,38],"semantic":[17,31,89,124,165,208],"entanglement":[18],"of":[19,132,159],"data":[21],"poses":[22],"significant":[23],"challenges:":[24],"unstructured":[25],"content":[27],"typically":[28],"encapsulates":[29],"holistic":[30],"information,":[32,50],"whereas":[33],"structured":[34,76,130],"textual":[35],"descriptions":[36],"exhibit":[37],"subjectivity":[39],"locality.":[41],"Existing":[42],"paradigms":[43],"struggle":[44],"to":[45,65,86,127],"capture":[46],"this":[47,67],"asymmetric":[48],"cross-modal":[49],"ideally":[51],"yielding":[52],"only":[53],"averaged":[54],"representations":[55],"that":[56,196],"suppress":[57],"fine-grained":[58],"details.":[59],"Although":[60],"set-based":[61],"embedding":[62],"methods":[63],"attempt":[64],"alleviate":[66],"asymmetry,":[68],"existing":[69],"approaches":[70],"often":[71],"lack":[72],"guidance":[73],"from":[74,104],"explicit":[75],"semantics":[77],"focus":[79],"solely":[80],"on":[81,189],"local":[82],"view":[83],"optimization,":[84],"leading":[85],"stochasticity":[87],"in":[88,205],"subspace":[90],"decomposition":[91],"gradient":[93],"sparsity.":[94],"This":[95],"paper":[96],"proposes":[97],"Multi-view":[98],"Semantic":[99,114],"Topology":[100],"Alignment":[101,173],"with":[102],"Reasoning":[103],"VLMs":[105],"(M-STAR),":[106],"multi-view":[108],"modeling":[109],"framework.":[110],"We":[111],"design":[112],"Prior":[115],"Acquisition":[116],"via":[117,150],"MLLM":[118],"(SPAM)":[119],"module,":[120,144],"employing":[121],"dual-granularity":[123],"encoding":[125],"mechanism":[126],"guide":[128],"reconstruction":[131],"features.":[134],"Furthermore,":[135],"we":[136,168],"introduce":[137],"Multi-View":[139,172],"Global":[140],"Instance":[141],"Discrimination":[142],"(MGID)":[143],"which":[145],"imposes":[146],"cross-view":[147],"discriminative":[148],"constraints":[149],"centroid":[152],"pool,":[153],"ensuring":[154],"uniqueness":[156],"robustness":[158,204],"embeddings":[161],"within":[162],"global":[164],"space.":[166],"Finally,":[167],"propose":[169],"Holistic":[171],"(HMA)":[174],"optimization":[175],"strategy,":[176],"facilitating":[177],"collaborative":[178],"learning":[179],"across":[180],"subspaces":[181],"significantly":[183],"enhancing":[184],"inter-view":[185],"diversity.":[186],"Extensive":[187],"experiments":[188],"Flickr30K":[191],"MS-COCO":[193],"datasets":[194],"demonstrate":[195],"M-STAR":[197],"achieves":[198],"state-of-the-art":[199],"performance":[200],"exhibits":[202],"superior":[203],"handling":[206],"complex":[207],"scenarios.":[209]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
