{"id":"https://openalex.org/W4415400662","doi":"https://doi.org/10.1145/3728424.3760764","title":"Vision-Language Models for Automatic Captioning and Cross-Modal Retrieval","display_name":"Vision-Language Models for Automatic Captioning and Cross-Modal Retrieval","publication_year":2025,"publication_date":"2025-10-21","ids":{"openalex":"https://openalex.org/W4415400662","doi":"https://doi.org/10.1145/3728424.3760764"},"language":null,"primary_location":{"id":"doi:10.1145/3728424.3760764","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728424.3760764","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2nd International Workshop on Multimedia Computing for Health and Medicine","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3728424.3760764","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120084559","display_name":"Ikram Ounadi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210166741","display_name":"University of Klagenfurt","ror":"https://ror.org/05q9m0937","country_code":"AT","type":"education","lineage":["https://openalex.org/I4210166741"]}],"countries":["AT"],"is_corresponding":true,"raw_author_name":"Ikram Ounadi","raw_affiliation_strings":["University of Klagenfurt, Klagenfurt, Austria"],"raw_orcid":"https://orcid.org/0009-0001-1111-6813","affiliations":[{"raw_affiliation_string":"University of Klagenfurt, Klagenfurt, Austria","institution_ids":["https://openalex.org/I4210166741"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020123717","display_name":"Klaus Schoeffmann","orcid":"https://orcid.org/0000-0002-9218-1704"},"institutions":[{"id":"https://openalex.org/I4210166741","display_name":"University of Klagenfurt","ror":"https://ror.org/05q9m0937","country_code":"AT","type":"education","lineage":["https://openalex.org/I4210166741"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Klaus Schoeffmann","raw_affiliation_strings":["University of Klagenfurt, Klagenfurt, Austria"],"raw_orcid":"https://orcid.org/0000-0002-9218-1704","affiliations":[{"raw_affiliation_string":"University of Klagenfurt, Klagenfurt, Austria","institution_ids":["https://openalex.org/I4210166741"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5120084559"],"corresponding_institution_ids":["https://openalex.org/I4210166741"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.27957522,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"52","last_page":"56"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.989300012588501,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8708999752998352},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8065000176429749},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.671500027179718},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5078999996185303},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4657999873161316},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.3621000051498413}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8708999752998352},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8065000176429749},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7386999726295471},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.671500027179718},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5695000290870667},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5078999996185303},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4657999873161316},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3621000051498413},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.34630000591278076},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.29600000381469727},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.26930001378059387},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2535000145435333},{"id":"https://openalex.org/C3019831412","wikidata":"https://www.wikidata.org/wiki/Q5778278","display_name":"Fully automatic","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3728424.3760764","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728424.3760764","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2nd International Workshop on Multimedia Computing for Health and Medicine","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3728424.3760764","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728424.3760764","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2nd International Workshop on Multimedia Computing for Health and Medicine","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W1895577753","https://openalex.org/W2612690371","https://openalex.org/W2884036902","https://openalex.org/W3201906559","https://openalex.org/W4213023287","https://openalex.org/W4386075916","https://openalex.org/W4400024497"],"related_works":[],"abstract_inverted_index":{"Artificial":[0],"intelligence":[1],"is":[2,106],"increasingly":[3],"used":[4],"in":[5,46,95,113],"medicine":[6],"to":[7,27],"support":[8,123],"diagnosis,":[9],"documentation,":[10],"and":[11,16,63,83,126],"surgical":[12,52,69,100],"training.":[13],"In":[14],"ophthalmology":[15],"cataract":[17,47],"surgery,":[18,115],"large":[19],"volumes":[20],"of":[21,30,51,60,92,99],"video":[22,56],"data":[23],"remain":[24],"underutilized":[25],"due":[26],"the":[28,90,97,107,117],"lack":[29],"automated":[31],"analysis":[32],"tools.":[33],"This":[34],"work":[35],"proposes":[36],"a":[37,72],"unified":[38],"multi-modal":[39],"pipeline":[40],"that":[41,122],"addresses":[42],"three":[43],"key":[44],"tasks":[45,112],"surgery:":[48],"(1)":[49],"classification":[50],"phases":[53],"from":[54],"individual":[55],"frames,":[57],"(2)":[58],"retrieval":[59],"cross-modal":[61],"image-text,":[62],"(3)":[64],"automatic":[65],"caption":[66],"generation":[67],"for":[68,75,81,85,119],"images.":[70],"Using":[71],"fine-tuned":[73],"ResNet50":[74],"classification,":[76],"an":[77],"adapted":[78],"CLIP":[79],"model":[80],"retrieval,":[82],"BLIP":[84],"generative":[86],"captioning,":[87],"we":[88],"demonstrate":[89],"potential":[91],"vision-language":[93],"models":[94],"enhancing":[96],"interpretability":[98],"data.":[101],"To":[102],"our":[103],"knowledge,":[104],"this":[105],"first":[108],"study":[109],"combining":[110],"these":[111],"ophthalmic":[114],"laying":[116],"foundation":[118],"intelligent":[120],"systems":[121],"clinical":[124],"workflows":[125],"medical":[127],"education.":[128]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-22T00:00:00"}
