{"id":"https://openalex.org/W4403791564","doi":"https://doi.org/10.1145/3664647.3680707","title":"Align2Concept: Language Guided Interpretable Image Recognition by Visual Prototype and Textual Concept Alignment","display_name":"Align2Concept: Language Guided Interpretable Image Recognition by Visual Prototype and Textual Concept Alignment","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403791564","doi":"https://doi.org/10.1145/3664647.3680707"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3680707","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3680707","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043642600","display_name":"J. Wang","orcid":"https://orcid.org/0009-0000-1766-7774"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiaqi Wang","raw_affiliation_strings":["Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042680345","display_name":"Pichao Wang","orcid":"https://orcid.org/0000-0002-1430-0237"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pichao Wang","raw_affiliation_strings":["Alibaba Group, Seattle, USA"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Seattle, USA","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100965549","display_name":"Yi Feng","orcid":"https://orcid.org/0000-0002-8358-2995"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Feng","raw_affiliation_strings":["Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100725633","display_name":"Huafeng Liu","orcid":"https://orcid.org/0000-0002-7914-6867"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huafeng Liu","raw_affiliation_strings":["Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111336307","display_name":"Chang Gao","orcid":"https://orcid.org/0009-0009-9464-378X"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chang Gao","raw_affiliation_strings":["Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069749738","display_name":"Liping Jing","orcid":"https://orcid.org/0000-0001-7578-3407"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liping Jing","raw_affiliation_strings":["Beijing Jiaotong University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Jiaotong University, Beijing, China","institution_ids":["https://openalex.org/I21193070"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5043642600"],"corresponding_institution_ids":["https://openalex.org/I21193070"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18500171,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"8972","last_page":"8981"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7708969116210938},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6933377385139465},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5228936076164246},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4695627689361572},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.45619896054267883},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.41483980417251587},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.382531076669693},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1430376172065735}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7708969116210938},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6933377385139465},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5228936076164246},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4695627689361572},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.45619896054267883},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.41483980417251587},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.382531076669693},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1430376172065735},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3680707","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3680707","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W64698994","https://openalex.org/W1536680647","https://openalex.org/W1538281814","https://openalex.org/W1598140581","https://openalex.org/W1787224781","https://openalex.org/W1849277567","https://openalex.org/W1992817935","https://openalex.org/W2045830397","https://openalex.org/W2086953401","https://openalex.org/W2137492840","https://openalex.org/W2144502914","https://openalex.org/W2153469952","https://openalex.org/W2190008860","https://openalex.org/W2194775991","https://openalex.org/W2195388612","https://openalex.org/W2295107390","https://openalex.org/W2533598788","https://openalex.org/W2559655401","https://openalex.org/W2606462007","https://openalex.org/W2764024122","https://openalex.org/W2776207810","https://openalex.org/W2962784289","https://openalex.org/W2962858109","https://openalex.org/W2963446712","https://openalex.org/W2981852735","https://openalex.org/W2988157455","https://openalex.org/W3004725381","https://openalex.org/W3027723803","https://openalex.org/W3034676907","https://openalex.org/W3081636091","https://openalex.org/W3097858943","https://openalex.org/W3101609372","https://openalex.org/W3106638601","https://openalex.org/W3113107791","https://openalex.org/W3172917901","https://openalex.org/W3175314453","https://openalex.org/W3202988816","https://openalex.org/W3215928733","https://openalex.org/W4221144077","https://openalex.org/W4288089799","https://openalex.org/W4386075969","https://openalex.org/W4386699333","https://openalex.org/W6602657573"],"related_works":["https://openalex.org/W2755342338","https://openalex.org/W2369501117","https://openalex.org/W2779427294","https://openalex.org/W2775347418","https://openalex.org/W2625805835","https://openalex.org/W2079911747","https://openalex.org/W3116076068","https://openalex.org/W3003936178","https://openalex.org/W2145652935","https://openalex.org/W2563206327"],"abstract_inverted_index":{"Most":[0],"works":[1],"of":[2,62,82,136,233],"interpretable":[3,226,240],"neural":[4],"networks":[5],"strive":[6],"for":[7,55],"learning":[8,56,94],"the":[9,30,35,60,66,76,113,118,125,129,133,156,163,170,174,184,204,214,224],"semantics":[10,31],"concepts":[11,25],"merely":[12],"from":[13,26,37,128],"single":[14],"modal":[15],"information":[16],"such":[17],"as":[18,121],"images.":[19],"However,":[20],"humans":[21],"usually":[22],"learn":[23],"semantic":[24],"multiple":[27],"modalities":[28],"and":[29,45,132,206,219],"is":[32],"encoded":[33,138,195],"by":[34,42,117,139,166,196],"brain":[36],"fused":[38],"multi-modal":[39],"information.":[40],"Inspired":[41],"cognitive":[43],"science":[44],"vision-language":[46],"learning,":[47],"we":[48,68,111],"propose":[49],"a":[50,71,87,122,151,231],"Prototype-Concept":[51],"Alignment":[52],"Network":[53],"(ProCoNet)":[54],"visual":[57,72,106,178],"prototypes":[58,107,137,179],"under":[59],"guidance":[61],"textual":[63,102,109,192],"concepts.":[64,103],"In":[65],"ProCoNet,":[67],"have":[69],"designed":[70],"encoder":[73],"to":[74,95,100,187,223],"decompose":[75],"input":[77],"image":[78],"into":[79,161,183],"regional":[80,126],"features":[81,127,194],"prototypes,":[83],"while":[84],"also":[85],"developing":[86],"prompt":[88,96],"generation":[89],"strategy":[90],"that":[91,213],"incorporates":[92],"in-context":[93],"large":[97],"language":[98],"models":[99],"generate":[101],"To":[104],"align":[105,188],"with":[108,189],"concepts,":[110],"leverage":[112],"multimodal":[114,185],"space":[115,131,186],"provided":[116],"pre-trained":[119],"CLIP":[120,140],"bridge.":[123],"Specifically,":[124],"vision":[130],"cropped":[134],"regions":[135],"reside":[141],"on":[142,169,203],"different":[143],"but":[144],"semantically":[145,190],"highly":[146],"correlated":[147],"manifolds,":[148],"i.e.":[149],"follow":[150],"multi-manifold":[152,157],"distribution.":[153],"We":[154,198],"transform":[155,168],"distribution":[158],"alignment":[159],"problem":[160],"optimizing":[162],"projection":[164,176],"matrix":[165],"Cayley":[167],"Stiefel":[171],"manifold.":[172],"Through":[173],"learned":[175],"matrix,":[177],"can":[180],"be":[181],"projected":[182],"similar":[191],"concept":[193],"CLIP.":[197],"conducted":[199],"two":[200],"case":[201],"studies":[202],"CUB-200-2011":[205],"Oxford":[207],"Flower":[208],"dataset.":[209],"Our":[210],"experiments":[211],"show":[212],"ProCoNet":[215,229],"provides":[216],"higher":[217],"accuracy":[218],"better":[220],"interpretability":[221,234],"compared":[222],"single-modality":[225],"model.":[227],"Furthermore,":[228],"offers":[230],"level":[232],"not":[235],"previously":[236],"available":[237],"in":[238],"other":[239],"methods.":[241]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
