{"id":"https://openalex.org/W4415539819","doi":"https://doi.org/10.1145/3746027.3755216","title":"Reliable Cross-modal Alignment via Prototype Iterative Construction","display_name":"Reliable Cross-modal Alignment via Prototype Iterative Construction","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539819","doi":"https://doi.org/10.1145/3746027.3755216"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755216","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755216","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036449183","display_name":"Xiang Ma","orcid":"https://orcid.org/0000-0002-4963-8705"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiang Ma","raw_affiliation_strings":["Shandong University, Jinan, Shandong, China"],"affiliations":[{"raw_affiliation_string":"Shandong University, Jinan, Shandong, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120132166","display_name":"Litian Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I23923803","display_name":"University of Exeter","ror":"https://ror.org/03yghzc09","country_code":"GB","type":"education","lineage":["https://openalex.org/I23923803"]},{"id":"https://openalex.org/I4210155855","display_name":"Devon & Exeter Institution","ror":"https://ror.org/052gw5984","country_code":"GB","type":"nonprofit","lineage":["https://openalex.org/I4210155855"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Litian Xu","raw_affiliation_strings":["The University of Exeter, Exeter, Devon, United Kingdom"],"affiliations":[{"raw_affiliation_string":"The University of Exeter, Exeter, Devon, United Kingdom","institution_ids":["https://openalex.org/I23923803","https://openalex.org/I4210155855"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051677056","display_name":"Lexin Fang","orcid":"https://orcid.org/0000-0003-3243-851X"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lexin Fang","raw_affiliation_strings":["Shandong University, Jinan, Shandong, China"],"affiliations":[{"raw_affiliation_string":"Shandong University, Jinan, Shandong, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101753050","display_name":"Caiming Zhang","orcid":"https://orcid.org/0000-0003-0217-1543"},"institutions":[{"id":"https://openalex.org/I154099455","display_name":"Shandong University","ror":"https://ror.org/0207yh398","country_code":"CN","type":"education","lineage":["https://openalex.org/I154099455"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Caiming Zhang","raw_affiliation_strings":["Shandong University, Jinan, Shandong, China"],"affiliations":[{"raw_affiliation_string":"Shandong University, Jinan, Shandong, China","institution_ids":["https://openalex.org/I154099455"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101414718","display_name":"Lizhen Cui","orcid":"https://orcid.org/0000-0002-8262-8883"},"institutions":[{"id":"https://openalex.org/I34949971","display_name":"University of Jinan","ror":"https://ror.org/02mjz6f26","country_code":"CN","type":"education","lineage":["https://openalex.org/I34949971"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lizhen Cui","raw_affiliation_strings":["Shandong University, Jinan, China and The Joint SDU-NTU Centre for Artificial Intelligence Research, Jinan, China"],"affiliations":[{"raw_affiliation_string":"Shandong University, Jinan, China and The Joint SDU-NTU Centre for Artificial Intelligence Research, Jinan, China","institution_ids":["https://openalex.org/I34949971"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5036449183"],"corresponding_institution_ids":["https://openalex.org/I154099455"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.29866648,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"3847","last_page":"3855"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9729999899864197,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9625999927520752,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.6783999800682068},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.582099974155426},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5716000199317932},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.550000011920929},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5317000150680542},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.45840001106262207},{"id":"https://openalex.org/keywords/semantic-gap","display_name":"Semantic gap","score":0.45590001344680786},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.45089998841285706},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.4309999942779541}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7300000190734863},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.6783999800682068},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.582099974155426},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5716000199317932},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.550000011920929},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5317000150680542},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.45840001106262207},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.45590001344680786},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.45089998841285706},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.4309999942779541},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4189000129699707},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.39100000262260437},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.3619999885559082},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3578999936580658},{"id":"https://openalex.org/C2777946921","wikidata":"https://www.wikidata.org/wiki/Q7449044","display_name":"Semantic analysis (machine learning)","level":2,"score":0.3483000099658966},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.30300000309944153},{"id":"https://openalex.org/C2781089630","wikidata":"https://www.wikidata.org/wiki/Q21856745","display_name":"Realization (probability)","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C193125573","wikidata":"https://www.wikidata.org/wiki/Q7449065","display_name":"Semantic interpretation","level":2,"score":0.30149999260902405},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3012000024318695},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2930999994277954},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.28690001368522644},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28220000863075256},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27889999747276306},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2712000012397766},{"id":"https://openalex.org/C131584629","wikidata":"https://www.wikidata.org/wiki/Q4308705","display_name":"Coupling (piping)","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.2551000118255615},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755216","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755216","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7610069338","display_name":null,"funder_award_id":"92367202","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2912290085","https://openalex.org/W2914306086","https://openalex.org/W2963350250"],"related_works":[],"abstract_inverted_index":{"Cross-modal":[0],"alignment":[1],"is":[2,79,171],"an":[3],"important":[4],"multi-modal":[5],"task,":[6],"aiming":[7],"to":[8,52,80,189],"bridge":[9],"the":[10,26,42,67,87,103,131,144,147,152,155,183,205],"semantic":[11,27,39,88,108,138,156],"gap":[12],"between":[13,29,107],"different":[14],"modalities.":[15],"The":[16,165],"most":[17,91],"reliable":[18],"fundamention":[19],"for":[20,121],"achieving":[21],"this":[22,113,169],"objective":[23],"lies":[24],"in":[25,66],"consistency":[28],"matched":[30],"pairs.":[31],"Conventional":[32],"methods":[33,93,211],"implicitly":[34],"assume":[35],"embeddings":[36],"contain":[37],"solely":[38],"information,":[40,139],"ignoring":[41],"impact":[43],"of":[44,133,154,168,207],"non-semantic":[45,59],"information":[46,53,60],"during":[47,125,146],"alignment,":[48],"which":[49,69,100],"inevitably":[50],"leads":[51],"bias":[54],"or":[55],"even":[56],"loss.":[57],"These":[58],"primarily":[61],"manifest":[62],"as":[63,73,143],"stylistic":[64],"variations":[65],"data,":[68],"we":[70,115,129,158,178],"formally":[71],"define":[72],"style":[74,82,110,123],"information.":[75,89,111],"An":[76],"intuitive":[77],"approach":[78],"separate":[81],"from":[83],"semantics,":[84],"aligning":[85],"only":[86],"However,":[90],"existing":[92],"distinguish":[94],"them":[95],"based":[96],"on":[97,198],"feature":[98,135],"columns,":[99],"cannot":[101],"represent":[102],"complex":[104],"coupling":[105],"relationship":[106],"and":[109,140,177,201],"In":[112],"paper,":[114],"propose":[116,159],"PICO,":[117,208],"a":[118,160,172],"novel":[119],"framework":[120],"suppressing":[122],"interference":[124],"embedding":[126,148],"interaction.":[127,149],"Specifically,":[128],"quantify":[130],"probability":[132],"each":[134],"column":[136],"representing":[137],"regard":[141],"it":[142],"weight":[145,188],"To":[150],"ensure":[151],"reliability":[153],"probability,":[157],"prototype":[161],"iterative":[162],"construction":[163],"method.":[164],"key":[166],"operation":[167],"method":[170],"performance":[173,194],"feedback-based":[174],"weighting":[175],"function,":[176],"have":[179],"theoretically":[180],"proven":[181],"that":[182,191],"function":[184],"can":[185],"assign":[186],"higher":[187,193],"prototypes":[190],"bring":[192],"improvements.":[195],"Extensive":[196],"experiments":[197],"various":[199],"benchmarks":[200],"model":[202],"backbones":[203],"demonstrate":[204],"superiority":[206],"outperforming":[209],"state-of-the-art":[210],"by":[212],"5.2%-14.1%.":[213]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
