{"id":"https://openalex.org/W4415536975","doi":"https://doi.org/10.1145/3746027.3755509","title":"CSDN: CLIP-Driven Similarity-Aligned Distillation Network for Weakly-Supervised Object Localization","display_name":"CSDN: CLIP-Driven Similarity-Aligned Distillation Network for Weakly-Supervised Object Localization","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415536975","doi":"https://doi.org/10.1145/3746027.3755509"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755509","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755509","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120131387","display_name":"Sifan Zuo","orcid":null},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Sifan Zuo","raw_affiliation_strings":["School of Cyber Science and Engineering, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056810715","display_name":"Youfa Liu","orcid":"https://orcid.org/0000-0002-3540-5775"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Youfa Liu","raw_affiliation_strings":["School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5060042752","display_name":"Bo Du","orcid":"https://orcid.org/0000-0002-0059-8458"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Du","raw_affiliation_strings":["School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5120131387"],"corresponding_institution_ids":["https://openalex.org/I37461747"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32257931,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"4580","last_page":"4589"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6104999780654907},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5497999787330627},{"id":"https://openalex.org/keywords/completeness","display_name":"Completeness (order theory)","score":0.459199994802475},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.45910000801086426},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4449000060558319},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.44130000472068787},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3804999887943268},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.37940001487731934},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.37790000438690186}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7799000144004822},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6567999720573425},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6104999780654907},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5497999787330627},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4871000051498413},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.459199994802475},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.45910000801086426},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4449000060558319},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.44130000472068787},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3804999887943268},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.37940001487731934},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.37790000438690186},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3675999939441681},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.34850001335144043},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.328900009393692},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3208000063896179},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.32019999623298645},{"id":"https://openalex.org/C32022120","wikidata":"https://www.wikidata.org/wiki/Q797225","display_name":"Interference (communication)","level":3,"score":0.31049999594688416},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C85407183","wikidata":"https://www.wikidata.org/wiki/Q1045785","display_name":"Semantic network","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27059999108314514},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2662999927997589},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.25429999828338623},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755509","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755509","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W2138011018","https://openalex.org/W2295107390","https://openalex.org/W2950557962","https://openalex.org/W2964274719","https://openalex.org/W2990371274","https://openalex.org/W3035318183","https://openalex.org/W3095191037","https://openalex.org/W3107169861","https://openalex.org/W3110272085","https://openalex.org/W3142837074","https://openalex.org/W3176171254","https://openalex.org/W3176774696","https://openalex.org/W3203593070","https://openalex.org/W3204484221","https://openalex.org/W3207676276","https://openalex.org/W3207782762","https://openalex.org/W4200631217","https://openalex.org/W4226138051","https://openalex.org/W4307795760","https://openalex.org/W4312570500","https://openalex.org/W4313019229","https://openalex.org/W4313052647","https://openalex.org/W4382465452","https://openalex.org/W4386065896","https://openalex.org/W4386071798","https://openalex.org/W4386075561","https://openalex.org/W4386245233","https://openalex.org/W4387694723","https://openalex.org/W4390872717","https://openalex.org/W4393148942","https://openalex.org/W4402727450","https://openalex.org/W4402727633","https://openalex.org/W4403792225","https://openalex.org/W4409014152"],"related_works":[],"abstract_inverted_index":{"Weakly":[0],"Supervised":[1],"Object":[2],"Localization":[3],"(WSOL)":[4],"relies":[5],"only":[6],"on":[7,84,132,203],"image-level":[8],"labels":[9],"to":[10,59,110,165,192,199,221],"realize":[11],"object":[12,45],"localization,":[13],"significantly":[14,236],"reducing":[15],"the":[16,28,61,91,97,123,149,152,171,178,190,206,238],"cost":[17],"for":[18,253],"fine-grained":[19],"annotations.":[20],"While":[21],"traditional":[22],"CAM-based":[23],"methods":[24],"excel":[25],"at":[26],"identifying":[27],"most":[29],"prominent":[30],"regions":[31],"of":[32,63,94,101,125,243],"objects,":[33],"they":[34],"frequently":[35],"neglect":[36],"other":[37,172],"essential":[38],"components,":[39],"resulting":[40],"in":[41],"partial":[42],"or":[43],"incomplete":[44],"localization.":[46,197],"The":[47],"foreground":[48,204],"prediction":[49],"map":[50],"(FPM)":[51],"generates":[52],"finer-grained":[53],"activation":[54],"maps":[55],"using":[56,175],"underlying":[57],"features":[58,159],"address":[60],"shortcomings":[62],"CAM,":[64],"but":[65],"it":[66],"may":[67],"still":[68],"have":[69],"coverage":[70],"blind":[71],"spots.":[72],"To":[73],"this":[74,76],"end,":[75],"paper":[77],"proposes":[78],"a":[79,105,112,128,142,163,186,217,250],"collaborative":[80],"optimization":[81],"framework":[82],"based":[83,131],"cross-modal":[85,139],"semantic":[86,124,168],"alignment":[87,140],"that":[88,233],"deeply":[89],"integrates":[90],"saliency":[92],"awareness":[93],"CAM":[95,179],"with":[96,122],"refined":[98,161],"representation":[99],"capabilities":[100],"FPM.":[102],"It":[103],"introduces":[104],"multimodal":[106],"pretrained":[107],"model":[108,154],"(CLIP)":[109],"construct":[111],"semantic-driven":[113],"WSOL":[114],"paradigm.":[115],"By":[116],"dynamically":[117],"interacting":[118],"CLIP's":[119,138],"text":[120],"embeddings":[121],"image":[126],"categories,":[127],"semantic-enhanced":[129],"FPM":[130],"similarity":[133],"measurement":[134],"is":[135,146,155,183,213],"generated.":[136],"Leveraging":[137],"capabilities,":[141],"targeted":[143],"generation":[144],"scheme":[145],"designed.":[147],"On":[148,170],"one":[150],"hand,":[151,173],"CLIP":[153,182],"frozen":[156],"and":[157,226,240],"its":[158],"are":[160],"through":[162],"decoder":[164],"obtain":[166],"richer":[167],"representations;":[169],"by":[174,181],"knowledge":[176],"distillation,":[177],"generated":[180],"taken":[184],"as":[185],"reference":[187],"benchmark,":[188],"guiding":[189],"network":[191],"learn":[193],"more":[194],"accurate":[195],"target":[196,228,244],"Additionally,":[198],"enhance":[200],"FPM's":[201],"focus":[202],"regions,":[205],"Exponential":[207],"Decay":[208],"Foreground":[209],"Emphasis":[210],"(EDFE)":[211],"module":[212],"designed,":[214],"which":[215],"uses":[216],"differentiated":[218],"excitation":[219],"strategy":[220],"effectively":[222],"suppress":[223],"background":[224],"interference":[225],"highlight":[227],"areas.":[229],"Experimental":[230],"results":[231],"show":[232],"our":[234],"method":[235],"improves":[237],"completeness":[239],"boundary":[241],"accuracy":[242],"localization":[245],"under":[246],"weak":[247],"supervision,":[248],"laying":[249],"solid":[251],"foundation":[252],"subsequent":[254],"downstream":[255],"tasks.":[256]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
