{"id":"https://openalex.org/W7133346407","doi":"https://doi.org/10.48550/arxiv.2603.01124","title":"ClinCoT: Clinical-Aware Visual Chain-of-Thought for Medical Vision Language Models","display_name":"ClinCoT: Clinical-Aware Visual Chain-of-Thought for Medical Vision Language Models","publication_year":2026,"publication_date":"2026-03-01","ids":{"openalex":"https://openalex.org/W7133346407","doi":"https://doi.org/10.48550/arxiv.2603.01124"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01124","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01124","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01124","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127928411","display_name":"Xiwei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Xiwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128025049","display_name":"Yulong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yulong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127983597","display_name":"Xinlin Zhuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Xinlin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127999850","display_name":"Xuhui Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xuhui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123059531","display_name":"Jianxu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jianxu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128021302","display_name":"Haolin Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Haolin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121288446","display_name":"Imran Razzak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Razzak, Imran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127910158","display_name":"Yutong Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yutong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5127928411"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9283999800682068,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9283999800682068,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.01730000041425228,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.009100000374019146,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.692300021648407},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.6301000118255615},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4896000027656555},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.44909998774528503},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4408000111579895},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.4262999892234802},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4212999939918518}],"concepts":[{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.692300021648407},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6916000247001648},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.6301000118255615},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6062999963760376},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4896000027656555},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4887999892234802},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.44909998774528503},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4408000111579895},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.4262999892234802},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4212999939918518},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4047999978065491},{"id":"https://openalex.org/C181204326","wikidata":"https://www.wikidata.org/wiki/Q7239820","display_name":"Preference learning","level":3,"score":0.33219999074935913},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32850000262260437},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.3133000135421753},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3010999858379364},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2858999967575073},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2736999988555908},{"id":"https://openalex.org/C2779974597","wikidata":"https://www.wikidata.org/wiki/Q28448986","display_name":"Clinical Practice","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.26179999113082886}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01124","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01124","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01124","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01124","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.6788908839225769,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Medical":[0],"Vision-Language":[1],"Models":[2],"have":[3],"shown":[4],"promising":[5],"potential":[6],"in":[7,22],"clinical":[8,65],"decision":[9],"support,":[10],"yet":[11],"they":[12],"remain":[13],"prone":[14],"to":[15,19,48,87,116,125,146],"factual":[16,187],"hallucinations":[17],"due":[18],"insufficient":[20],"grounding":[21,188],"localized":[23],"pathological":[24],"evidence.":[25],"Existing":[26],"medical":[27,176],"alignment":[28,153,197],"methods":[29],"primarily":[30],"operate":[31],"at":[32],"the":[33,127,155],"response":[34],"level":[35],"through":[36,103],"preference":[37,82,101,141,170],"optimization,":[38],"improving":[39],"output":[40],"correctness":[41],"but":[42],"leaving":[43],"intermediate":[44],"reasoning":[45,104,149],"weakly":[46],"connected":[47],"visual":[49,66,77],"regions.":[50],"Although":[51],"chain-of-thought":[52,78],"(CoT)":[53],"enhances":[54],"multimodal":[55],"reasoning,":[56],"it":[57],"remains":[58],"largely":[59],"text-centric,":[60],"limiting":[61],"effective":[62],"integration":[63],"of":[64],"cues.":[67],"To":[68,151],"address":[69],"this":[70],"gap,":[71],"we":[72,161],"propose":[73],"ClinCoT,":[74],"a":[75,133],"clinical-aware":[76],"framework":[79],"that":[80,97,138,167,183],"transforms":[81],"optimization":[83,136],"from":[84],"response-level":[85],"correction":[86],"visual-driven":[88],"reasoning.":[89],"We":[90,130],"introduce":[91,132],"an":[92,163],"automatic":[93],"data":[94],"generation":[95,180],"pipeline":[96],"constructs":[98],"clinically":[99],"grounded":[100],"pairs":[102],"with":[105,194],"hypotheses-driven":[106],"region":[107],"proposals.":[108],"Multiple":[109],"Med-LLMs":[110],"evaluators":[111],"rank":[112],"and":[113,119,143,178,189],"assign":[114],"scores":[115],"each":[117],"response,":[118],"these":[120],"rankings":[121],"serve":[122],"as":[123,154],"supervision":[124],"train":[126],"target":[128],"model.":[129],"further":[131],"scoring-based":[134],"margin-aware":[135],"strategy":[137],"incorporates":[139],"both":[140],"ranking":[142],"score":[144],"difference":[145],"refine":[147],"region-level":[148],"trajectories.":[150],"maintain":[152],"model's":[156],"policy":[157],"evolves":[158],"during":[159],"training,":[160],"adopt":[162],"iterative":[164],"learning":[165],"scheme":[166],"dynamically":[168],"regenerates":[169],"data.":[171],"Extensive":[172],"experiments":[173],"on":[174],"three":[175],"VQA":[177],"report":[179],"benchmarks":[181],"demonstrate":[182],"ClinCoT":[184],"consistently":[185],"improves":[186],"achieves":[190],"superior":[191],"performance":[192],"compared":[193],"existing":[195],"preference-based":[196],"methods.":[198]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
