{"id":"https://openalex.org/W7160956816","doi":"https://doi.org/10.48550/arxiv.2605.08493","title":"CapCLIP: A Vision-Language Representation Alignment Approach for Wireless Capsule Endoscopy Analysis","display_name":"CapCLIP: A Vision-Language Representation Alignment Approach for Wireless Capsule Endoscopy Analysis","publication_year":2026,"publication_date":"2026-05-08","ids":{"openalex":"https://openalex.org/W7160956816","doi":"https://doi.org/10.48550/arxiv.2605.08493"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08493","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08493","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08493","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021585972","display_name":"Haroon Wahab","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wahab, Haroon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039355175","display_name":"Irfan Mehmood","orcid":"https://orcid.org/0000-0001-7864-957X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mehmood, Irfan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5083781406","display_name":"Hassan Ugail","orcid":"https://orcid.org/0000-0002-3084-1797"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ugail, Hassan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11378","display_name":"Gastrointestinal Bleeding Diagnosis and Treatment","score":0.847100019454956,"subfield":{"id":"https://openalex.org/subfields/2715","display_name":"Gastroenterology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11378","display_name":"Gastrointestinal Bleeding Diagnosis and Treatment","score":0.847100019454956,"subfield":{"id":"https://openalex.org/subfields/2715","display_name":"Gastroenterology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10552","display_name":"Colorectal Cancer Screening and Detection","score":0.031700000166893005,"subfield":{"id":"https://openalex.org/subfields/2730","display_name":"Oncology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10862","display_name":"AI in cancer detection","score":0.01889999955892563,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8828999996185303},{"id":"https://openalex.org/keywords/capsule-endoscopy","display_name":"Capsule endoscopy","score":0.7788000106811523},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6287999749183655},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.42480000853538513},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.3456000089645386},{"id":"https://openalex.org/keywords/wireless","display_name":"Wireless","score":0.34119999408721924},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.336899995803833},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.29670000076293945}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8828999996185303},{"id":"https://openalex.org/C2777333622","wikidata":"https://www.wikidata.org/wiki/Q116753","display_name":"Capsule endoscopy","level":2,"score":0.7788000106811523},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6406999826431274},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6384000182151794},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6287999749183655},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.42480000853538513},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3831999897956848},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3702000081539154},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3456000089645386},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.34119999408721924},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30489999055862427},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2669999897480011},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08493","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08493","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08493","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08493","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7715441584587097}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Wireless":[0],"capsule":[1,79,192],"endoscopy":[2,80,193],"(WCE)":[3],"enables":[4],"non-invasive":[5],"visual":[6],"assessment":[7],"of":[8,22,30,198],"the":[9,19,28,146,196],"small":[10],"bowel,":[11],"but":[12],"its":[13],"clinical":[14],"utility":[15],"is":[16,108],"constrained":[17],"by":[18],"large":[20],"volume":[21],"frames":[23,81],"generated":[24],"per":[25],"examination":[26],"and":[27,53,59,91,103,114,137,157,174,194],"difficulty":[29],"recognising":[31],"subtle":[32],"abnormalities":[33],"under":[34,118],"highly":[35],"variable":[36],"imaging":[37],"conditions.":[38],"Existing":[39],"learning-based":[40],"approaches":[41],"for":[42,75],"WCE":[43,124,178,200],"are":[44,99],"predominantly":[45],"vision-only,":[46],"often":[47],"confined":[48],"to":[49,191],"narrow":[50],"pathology":[51],"sets,":[52],"show":[54],"limited":[55],"transfer":[56],"across":[57],"datasets":[58],"centres.":[60],"To":[61],"address":[62],"these":[63,141],"limitations,":[64],"this":[65],"study":[66],"introduces":[67],"CapCLIP,":[68],"a":[69,185],"domain-specific":[70],"vision-language":[71,115],"representation":[72,168],"learning":[73,96,169],"framework":[74,107],"WCE.":[76],"CapCLIP":[77,143,183],"aligns":[78],"with":[82,149],"clinically":[83],"grounded":[84],"textual":[85],"descriptions":[86],"derived":[87],"from":[88],"standardised":[89],"nomenclature":[90],"pathology-aware":[92],"caption":[93],"templates,":[94],"thereby":[95],"embeddings":[97],"that":[98,166],"both":[100,172],"semantically":[101],"informed":[102],"transferable.":[104],"The":[105,163],"proposed":[106],"evaluated":[109],"against":[110],"relevant":[111],"open-source":[112],"vision":[113],"foundation":[116,188],"models":[117,189],"strict":[119],"zero-shot":[120,154],"conditions":[121],"using":[122],"unseen":[123],"datasets.":[125,162],"Evaluation":[126],"covers":[127],"three":[128],"downstream":[129],"tasks:":[130],"K-nearest":[131],"neighbour":[132],"classification,":[133,136],"CLIP-style":[134],"image-text":[135,155],"text-to-image":[138],"retrieval.":[139],"Across":[140],"settings,":[142],"consistently":[144],"outperforms":[145],"compared":[147],"baselines,":[148],"particularly":[150],"strong":[151],"gains":[152],"in":[153,177],"classification":[156],"cross-modal":[158],"retrieval":[159],"on":[160],"out-of-distribution":[161],"results":[164],"indicate":[165],"language-guided":[167],"can":[170],"improve":[171],"generalisation":[173],"semantic":[175],"interpretability":[176],"analysis.":[179,201],"These":[180],"findings":[181],"position":[182],"as":[184],"step":[186],"toward":[187],"tailored":[190],"support":[195],"use":[197],"language-grounded":[199]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
