{"id":"https://openalex.org/W4389314667","doi":"https://doi.org/10.1145/3627377.3627442","title":"Enhancing Multimodal Understanding with CLIP-Based Image-to-Text Transformation","display_name":"Enhancing Multimodal Understanding with CLIP-Based Image-to-Text Transformation","publication_year":2023,"publication_date":"2023-09-22","ids":{"openalex":"https://openalex.org/W4389314667","doi":"https://doi.org/10.1145/3627377.3627442"},"language":"en","primary_location":{"id":"doi:10.1145/3627377.3627442","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3627377.3627442","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 6th International Conference on Big Data Technologies","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102706631","display_name":"Chang Che","orcid":"https://orcid.org/0009-0004-0124-1452"},"institutions":[{"id":"https://openalex.org/I193531525","display_name":"George Washington University","ror":"https://ror.org/00y4zzh67","country_code":"US","type":"education","lineage":["https://openalex.org/I193531525"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Chang Che","raw_affiliation_strings":["The George Washington University, USA"],"raw_orcid":"https://orcid.org/0009-0004-0124-1452","affiliations":[{"raw_affiliation_string":"The George Washington University, USA","institution_ids":["https://openalex.org/I193531525"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103221704","display_name":"Qunwei Lin","orcid":"https://orcid.org/0009-0008-7716-7761"},"institutions":[{"id":"https://openalex.org/I165075387","display_name":"Trine University","ror":"https://ror.org/038e0dv78","country_code":"US","type":"education","lineage":["https://openalex.org/I165075387"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qunwei Lin","raw_affiliation_strings":["Trine University, USA"],"raw_orcid":"https://orcid.org/0009-0008-7716-7761","affiliations":[{"raw_affiliation_string":"Trine University, USA","institution_ids":["https://openalex.org/I165075387"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101769971","display_name":"Xinyu Zhao","orcid":"https://orcid.org/0009-0006-0086-7554"},"institutions":[{"id":"https://openalex.org/I165075387","display_name":"Trine University","ror":"https://ror.org/038e0dv78","country_code":"US","type":"education","lineage":["https://openalex.org/I165075387"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xinyu Zhao","raw_affiliation_strings":["Trine University, USA"],"raw_orcid":"https://orcid.org/0009-0006-0086-7554","affiliations":[{"raw_affiliation_string":"Trine University, USA","institution_ids":["https://openalex.org/I165075387"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058212759","display_name":"Jiaxin Huang","orcid":"https://orcid.org/0009-0005-0291-6092"},"institutions":[{"id":"https://openalex.org/I165075387","display_name":"Trine University","ror":"https://ror.org/038e0dv78","country_code":"US","type":"education","lineage":["https://openalex.org/I165075387"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiaxin Huang","raw_affiliation_strings":["Trine University, USA"],"raw_orcid":"https://orcid.org/0009-0005-0291-6092","affiliations":[{"raw_affiliation_string":"Trine University, USA","institution_ids":["https://openalex.org/I165075387"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061018627","display_name":"Liqiang Yu","orcid":"https://orcid.org/0009-0003-4243-1511"},"institutions":[{"id":"https://openalex.org/I40347166","display_name":"University of Chicago","ror":"https://ror.org/024mw5h28","country_code":"US","type":"education","lineage":["https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Liqiang Yu","raw_affiliation_strings":["The University of Chicago, USA"],"raw_orcid":"https://orcid.org/0009-0003-4243-1511","affiliations":[{"raw_affiliation_string":"The University of Chicago, USA","institution_ids":["https://openalex.org/I40347166"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5102706631"],"corresponding_institution_ids":["https://openalex.org/I193531525"],"apc_list":null,"apc_paid":null,"fwci":5.4435,"has_fulltext":false,"cited_by_count":47,"citation_normalized_percentile":{"value":0.97059092,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"414","last_page":"418"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9887999892234802,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9789999723434448,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.786733865737915},{"id":"https://openalex.org/keywords/transformation","display_name":"Transformation (genetics)","score":0.6450293064117432},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6217401027679443},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5939028263092041},{"id":"https://openalex.org/keywords/ensemble-learning","display_name":"Ensemble learning","score":0.5304678678512573},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4976084530353546},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4798690378665924},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.44979429244995117},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.446433424949646},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3776401877403259},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3565324544906616},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3487839698791504},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3203948438167572}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.786733865737915},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.6450293064117432},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6217401027679443},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5939028263092041},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.5304678678512573},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4976084530353546},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4798690378665924},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.44979429244995117},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.446433424949646},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3776401877403259},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3565324544906616},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3487839698791504},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3203948438167572},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3627377.3627442","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3627377.3627442","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 6th International Conference on Big Data Technologies","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7799999713897705,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W1905882502","https://openalex.org/W1956340063","https://openalex.org/W2088252378","https://openalex.org/W2334125269","https://openalex.org/W2596142952","https://openalex.org/W2754927243","https://openalex.org/W2886641317","https://openalex.org/W2963084599","https://openalex.org/W2963758027","https://openalex.org/W3121523901","https://openalex.org/W3135367836"],"related_works":["https://openalex.org/W2081900870","https://openalex.org/W2037549926","https://openalex.org/W2345479200","https://openalex.org/W2183306018","https://openalex.org/W2849310602","https://openalex.org/W3006008237","https://openalex.org/W2419146053","https://openalex.org/W4388890789","https://openalex.org/W2088247287","https://openalex.org/W2963903416"],"abstract_inverted_index":{"The":[0,70,96],"process":[1],"of":[2,20,40,53,165,189],"transforming":[3],"input":[4],"images":[5,93],"into":[6],"corresponding":[7],"textual":[8],"explanations":[9],"stands":[10],"as":[11],"a":[12,113],"crucial":[13],"and":[14,23,94,155],"complex":[15],"endeavor":[16],"within":[17,65,132],"the":[18,38,54,66,122,133,144,150,163,178,186],"domains":[19],"computer":[21],"vision":[22],"natural":[24],"language":[25],"processing.":[26],"In":[27],"this":[28,119],"paper,":[29],"we":[30],"propose":[31],"an":[32,74],"innovative":[33],"ensemble":[34,47,138,167,190],"approach":[35,139],"that":[36],"harnesses":[37],"capabilities":[39],"Contrastive":[41],"Language-Image":[42],"Pretraining":[43],"(CLIP)":[44],"models.":[45,172],"Our":[46],"framework":[48],"encompasses":[49],"two":[50],"significant":[51],"variations":[52],"CLIP":[55,171],"model,":[56],"each":[57],"meticulously":[58],"designed":[59],"to":[60,62,106,148],"cater":[61],"specific":[63],"nuances":[64],"image-to-text":[67,125,181],"transformation":[68,126,182],"landscape.":[69],"first":[71],"model":[72,98,123],"introduces":[73],"elaborated":[75],"architecture,":[76],"featuring":[77],"multiple":[78],"layers":[79],"with":[80],"distinct":[81],"learning":[82,104,191],"rates,":[83],"thereby":[84],"amplifying":[85],"its":[86],"adeptness":[87],"in":[88,180,192],"capturing":[89],"intricate":[90,195],"relationships":[91],"between":[92,152],"text.":[95],"second":[97],"strategically":[99],"exploits":[100],"CLIP\u2019s":[101],"inherent":[102],"zero-shot":[103],"potential":[105],"generate":[107],"image-text":[108],"embeddings,":[109],"subsequently":[110],"harnessed":[111],"by":[112,127],"K-Nearest":[114],"Neighbors":[115],"(KNN)":[116],"model.":[117],"Through":[118],"KNN-based":[120],"paradigm,":[121],"facilitates":[124],"identifying":[128],"closely":[129],"related":[130],"embeddings":[131,154],"embedding":[134],"space.":[135],"Notably,":[136],"our":[137,166],"is":[140],"rigorously":[141],"evaluated,":[142],"employing":[143],"cosine":[145],"similarity":[146],"metric":[147],"gauge":[149],"alignment":[151],"model-generated":[153],"ground":[156],"truth":[157],"representations.":[158],"Comparative":[159],"experiments":[160],"vividly":[161],"highlight":[162],"superiority":[164],"strategy":[168],"over":[169],"standalone":[170],"This":[173],"study":[174],"not":[175],"only":[176],"advances":[177],"state-of-the-art":[179],"but":[183],"also":[184],"accentuates":[185],"promising":[187],"trajectory":[188],"effectively":[193],"addressing":[194],"multimodal":[196],"tasks.":[197]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":28},{"year":2023,"cited_by_count":7}],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2025-10-10T00:00:00"}
