{"id":"https://openalex.org/W4414360029","doi":"https://doi.org/10.24963/ijcai.2025/224","title":"MSCI: Addressing CLIP's Inherent Limitations for Compositional Zero-Shot Learning","display_name":"MSCI: Addressing CLIP's Inherent Limitations for Compositional Zero-Shot Learning","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360029","doi":"https://doi.org/10.24963/ijcai.2025/224"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/224","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/224","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100371957","display_name":"Yue Wang","orcid":"https://orcid.org/0000-0001-6271-6547"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yue Wang","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101545233","display_name":"Shuai Xu","orcid":"https://orcid.org/0000-0001-7656-2761"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]},{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Xu","raw_affiliation_strings":["Key Laboratory of Social Computing and Cognitive Intelligence (Dalian University of Technology), Ministry of Education, China","Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Key Laboratory of Social Computing and Cognitive Intelligence (Dalian University of Technology), Ministry of Education, China","institution_ids":["https://openalex.org/I27357992"]},{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104000466","display_name":"Xuelin Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xuelin Zhu","raw_affiliation_strings":["The Hong Kong Polytechnic University, Hong Kong, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University, Hong Kong, China","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059165070","display_name":"Yicong Li","orcid":"https://orcid.org/0000-0001-5108-4038"},"institutions":[{"id":"https://openalex.org/I9842412","display_name":"Nanjing University of Aeronautics and Astronautics","ror":"https://ror.org/01scyh794","country_code":"CN","type":"education","lineage":["https://openalex.org/I9842412"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yicong Li","raw_affiliation_strings":["Nanjing University of Aeronautics and Astronautics, Nanjing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Nanjing University of Aeronautics and Astronautics, Nanjing, China","institution_ids":["https://openalex.org/I9842412"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100371957"],"corresponding_institution_ids":["https://openalex.org/I9842412"],"apc_list":null,"apc_paid":null,"fwci":3.3116,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.9279805,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"2009","last_page":"2017"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11894","display_name":"Radiology practices and education","score":0.8844000101089478,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11894","display_name":"Radiology practices and education","score":0.8844000101089478,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11775","display_name":"COVID-19 diagnosis using AI","score":0.8467000126838684,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11381","display_name":"Infectious Diseases and Tuberculosis","score":0.8148000240325928,"subfield":{"id":"https://openalex.org/subfields/2746","display_name":"Surgery"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.603600025177002},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.48510000109672546},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.47380000352859497},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3797000050544739},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.29750001430511475},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.2922999858856201}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7835000157356262},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.603600025177002},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5184000134468079},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.48510000109672546},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.47380000352859497},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3797000050544739},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3734999895095825},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.36970001459121704},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.29750001430511475},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2922999858856201},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.271699994802475},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/224","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/224","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Compositional":[0],"Zero-Shot":[1],"Learning":[2],"(CZSL)":[3],"aims":[4],"to":[5,27,73,143,146],"recognize":[6],"unseen":[7],"state-object":[8],"combinations":[9],"by":[10],"leveraging":[11],"known":[12],"combinations.":[13],"Existing":[14],"studies":[15],"basically":[16],"rely":[17],"on":[18,129,150],"the":[19,106,119,138,157,162],"cross-modal":[20],"alignment":[21],"capabilities":[22],"of":[23,161],"CLIP":[24],"but":[25],"tend":[26],"overlook":[28],"its":[29,39],"limitations":[30],"in":[31],"capturing":[32],"fine-grained":[33,111],"local":[34,75,112,125],"features,":[35,88],"which":[36],"arise":[37],"from":[38,63,77,85],"architectural":[40],"and":[41,59,81,124,159,166],"training":[42],"paradigm.":[43],"To":[44],"address":[45],"this":[46],"issue,":[47],"we":[48,68],"propose":[49],"a":[50,100],"Multi-Stage":[51],"Cross-modal":[52],"Interaction":[53],"(MSCI)":[54],"model":[55],"that":[56],"effectively":[57],"explores":[58],"utilizes":[60],"intermediate-layer":[61],"information":[62,76,84,92,127],"CLIP's":[64],"visual":[65,79,87,113,126],"encoder.":[66],"Specifically,":[67],"design":[69],"two":[70],"self-adaptive":[71],"aggregators":[72],"extract":[74],"low-level":[78],"features":[80],"integrate":[82],"global":[83,123],"high-level":[86],"respectively.":[89],"These":[90],"key":[91],"are":[93,168],"progressively":[94],"incorporated":[95],"into":[96],"textual":[97],"representations":[98],"through":[99],"stage-by-stage":[101],"interaction":[102],"mechanism,":[103],"significantly":[104],"enhancing":[105],"model\u2019s":[107],"perception":[108],"capability":[109],"for":[110],"information.":[114],"Additionally,":[115],"MSCI":[116],"dynamically":[117],"adjusts":[118],"attention":[120],"weights":[121],"between":[122],"based":[128],"different":[130,135],"combinations,":[131],"as":[132,134],"well":[133],"elements":[136],"within":[137],"same":[139],"combination,":[140],"allowing":[141],"it":[142],"flexibly":[144],"adapt":[145],"diverse":[147],"scenarios.":[148],"Experiments":[149],"three":[151],"widely":[152],"used":[153],"datasets":[154],"fully":[155],"validate":[156],"effectiveness":[158],"superiority":[160],"proposed":[163],"model.":[164],"Data":[165],"code":[167],"available":[169],"at":[170],"https://github.com/ltpwy/MSCI.":[171]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
