{"id":"https://openalex.org/W7125895181","doi":"https://doi.org/10.1109/smc58881.2025.11343217","title":"Bidirectional Multimodal Knowledge Augmentation with Sparse Representation for Image-Text Retrieval","display_name":"Bidirectional Multimodal Knowledge Augmentation with Sparse Representation for Image-Text Retrieval","publication_year":2025,"publication_date":"2025-10-05","ids":{"openalex":"https://openalex.org/W7125895181","doi":"https://doi.org/10.1109/smc58881.2025.11343217"},"language":null,"primary_location":{"id":"doi:10.1109/smc58881.2025.11343217","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343217","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123523738","display_name":"Hengchang Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I28006308","display_name":"Shandong Normal University","ror":"https://ror.org/01wy3h363","country_code":"CN","type":"education","lineage":["https://openalex.org/I28006308"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hengchang Wang","raw_affiliation_strings":["Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358"],"affiliations":[{"raw_affiliation_string":"Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358","institution_ids":["https://openalex.org/I28006308"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124092716","display_name":"Li Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I28006308","display_name":"Shandong Normal University","ror":"https://ror.org/01wy3h363","country_code":"CN","type":"education","lineage":["https://openalex.org/I28006308"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Liu","raw_affiliation_strings":["Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358"],"affiliations":[{"raw_affiliation_string":"Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358","institution_ids":["https://openalex.org/I28006308"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124094247","display_name":"Huaxiang Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I28006308","display_name":"Shandong Normal University","ror":"https://ror.org/01wy3h363","country_code":"CN","type":"education","lineage":["https://openalex.org/I28006308"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huaxiang Zhang","raw_affiliation_strings":["Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358"],"affiliations":[{"raw_affiliation_string":"Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358","institution_ids":["https://openalex.org/I28006308"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079360792","display_name":"Xinfeng Dong","orcid":"https://orcid.org/0009-0001-6159-635X"},"institutions":[{"id":"https://openalex.org/I28006308","display_name":"Shandong Normal University","ror":"https://ror.org/01wy3h363","country_code":"CN","type":"education","lineage":["https://openalex.org/I28006308"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinfeng Dong","raw_affiliation_strings":["Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358"],"affiliations":[{"raw_affiliation_string":"Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358","institution_ids":["https://openalex.org/I28006308"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5123481853","display_name":"Hao Du","orcid":null},"institutions":[{"id":"https://openalex.org/I28006308","display_name":"Shandong Normal University","ror":"https://ror.org/01wy3h363","country_code":"CN","type":"education","lineage":["https://openalex.org/I28006308"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Du","raw_affiliation_strings":["Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358"],"affiliations":[{"raw_affiliation_string":"Shandong Normal University,School of Information Science and Engineering,Jinan,China,250358","institution_ids":["https://openalex.org/I28006308"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5123523738"],"corresponding_institution_ids":["https://openalex.org/I28006308"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.68422015,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"635","last_page":"640"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9519000053405762,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9519000053405762,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.02969999983906746,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.00559999980032444,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.7196999788284302},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.642799973487854},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5827000141143799},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5454999804496765},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5394999980926514},{"id":"https://openalex.org/keywords/sparse-approximation","display_name":"Sparse approximation","score":0.4828999936580658},{"id":"https://openalex.org/keywords/transformation","display_name":"Transformation (genetics)","score":0.4593000113964081},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4235000014305115},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4124000072479248},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4000000059604645}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7870000004768372},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.7196999788284302},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.642799973487854},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6238999962806702},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5827000141143799},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5454999804496765},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5394999980926514},{"id":"https://openalex.org/C124066611","wikidata":"https://www.wikidata.org/wiki/Q28684319","display_name":"Sparse approximation","level":2,"score":0.4828999936580658},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.4593000113964081},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4235000014305115},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4124000072479248},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4000000059604645},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.385699987411499},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36809998750686646},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.328900009393692},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.3197999894618988},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C194232998","wikidata":"https://www.wikidata.org/wiki/Q1606712","display_name":"Transition (genetics)","level":3,"score":0.3012000024318695},{"id":"https://openalex.org/C165443888","wikidata":"https://www.wikidata.org/wiki/Q1482183","display_name":"Transformation matrix","level":3,"score":0.2962000072002411},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2581999897956848}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/smc58881.2025.11343217","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343217","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2987671777","https://openalex.org/W3118694826","https://openalex.org/W4312933868","https://openalex.org/W4360897523","https://openalex.org/W4386065291","https://openalex.org/W4386071498","https://openalex.org/W4386071757","https://openalex.org/W4386072365","https://openalex.org/W4387967913","https://openalex.org/W4391744111","https://openalex.org/W4392678259","https://openalex.org/W4396712842","https://openalex.org/W4402715976","https://openalex.org/W4403780697","https://openalex.org/W4406890763","https://openalex.org/W4410153232"],"related_works":[],"abstract_inverted_index":{"In":[0,41],"cross-modal":[1,31,87,114,150],"retrieval,":[2],"the":[3,11,35,69,93,96,127,133,154,173],"core":[4],"challenge":[5],"of":[6,13,37],"image-text":[7],"retrieval":[8],"lies":[9],"in":[10,158],"mismatch":[12],"heterogeneous":[14],"modality":[15],"representation":[16,147],"spaces.":[17],"Traditional":[18],"methods":[19,171],"overly":[20],"rely":[21],"on":[22,112,172],"inter-modal":[23],"relationships":[24],"or":[25],"optimize":[26],"a":[27,46,57,77,104,142],"single":[28],"modality,":[29],"neglecting":[30],"feature":[32,83,117,129,134],"understanding":[33],"and":[34,68,90,122,175],"preservation":[36],"original":[38],"data":[39],"details.":[40],"this":[42],"paper,":[43],"we":[44,55,102],"propose":[45],"multimodal":[47],"collaborative":[48],"knowledge":[49,79],"augmentation":[50],"framework.":[51],"Unlike":[52],"single-path":[53],"enhancement,":[54],"design":[56,103],"novel":[58],"generation-reconstruction":[59],"dual-path":[60],"architecture.":[61],"The":[62],"image-to-text":[63],"generation":[64],"driven":[65,72],"by":[66,73],"BLIP-2":[67],"text-to-visual":[70],"reconstruction":[71],"Stable":[74],"Diffusion":[75],"form":[76],"closed-loop":[78],"cycle,":[80],"achieving":[81],"multi-granular":[82],"enhancement":[84],"through":[85,116],"bidirectional":[86],"semantic":[88],"transformation":[89],"deeply":[91],"constructing":[92],"relationship":[94],"within":[95],"visual-language":[97],"joint":[98],"embedding":[99],"space.":[100],"Then,":[101],"sparse-gated":[105],"dynamic":[106,123,138],"aggregation":[107],"mechanism,":[108],"which":[109],"precisely":[110],"focuses":[111],"key":[113],"semantics":[115],"decoupling,":[118],"spatially":[119],"sparse":[120],"masking,":[121],"weight":[124],"allocation.":[125],"Additionally,":[126],"temporal-progressive":[128],"fusion":[130,139],"strategy":[131],"controls":[132],"integration":[135],"ratio":[136],"via":[137],"coefficients,":[140],"realizing":[141],"smooth":[143],"transition":[144],"from":[145],"single-modal":[146],"learning":[148],"to":[149],"association":[151],"reinforcement,":[152],"overcoming":[153],"modal":[155],"interference":[156],"issue":[157],"traditional":[159],"fixed-weight":[160],"fusion.":[161],"Experimental":[162],"results":[163],"demonstrate":[164],"that":[165],"our":[166],"method":[167],"outperforms":[168],"existing":[169],"mainstream":[170],"Flickr30K":[174],"MS-COCO":[176],"datasets.":[177]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-29T00:00:00"}
