{"id":"https://openalex.org/W4413145734","doi":"https://doi.org/10.1109/cvpr52734.2025.01378","title":"VILA-M3: Enhancing Vision-Language Models with Medical Expert Knowledge","display_name":"VILA-M3: Enhancing Vision-Language Models with Medical Expert Knowledge","publication_year":2025,"publication_date":"2025-06-10","ids":{"openalex":"https://openalex.org/W4413145734","doi":"https://doi.org/10.1109/cvpr52734.2025.01378"},"language":"en","primary_location":{"id":"doi:10.1109/cvpr52734.2025.01378","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52734.2025.01378","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042877366","display_name":"Vishwesh Nath","orcid":"https://orcid.org/0000-0002-6840-6205"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Vishwesh Nath","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100461989","display_name":"Wenqi Li","orcid":"https://orcid.org/0000-0003-1081-2830"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wenqi Li","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031003824","display_name":"Dong Yang","orcid":"https://orcid.org/0000-0002-5031-4337"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong Yang","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002997328","display_name":"Andriy Myronenko","orcid":"https://orcid.org/0000-0001-8713-7031"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Andriy Myronenko","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100536603","display_name":"Mingxin Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mingxin Zheng","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100594691","display_name":"Yao Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao Lu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065136413","display_name":"Zhijian Liu","orcid":"https://orcid.org/0009-0007-3905-9893"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhijian Liu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108868459","display_name":"Hongxu Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongxu Yin","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109672976","display_name":"Y.M. Law","orcid":null},"institutions":[{"id":"https://openalex.org/I157582758","display_name":"SingHealth","ror":"https://ror.org/04me94w47","country_code":"SG","type":"funder","lineage":["https://openalex.org/I157582758"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yee Man Law","raw_affiliation_strings":["SingHealth"],"affiliations":[{"raw_affiliation_string":"SingHealth","institution_ids":["https://openalex.org/I157582758"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037478865","display_name":"Yucheng Tang","orcid":"https://orcid.org/0000-0002-6008-9700"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yucheng Tang","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051357970","display_name":"Pengfei Guo","orcid":"https://orcid.org/0009-0007-2561-4091"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pengfei Guo","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070998415","display_name":"Can Zhao","orcid":"https://orcid.org/0000-0001-7286-3452"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Can Zhao","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039608149","display_name":"Ziyue Xu","orcid":"https://orcid.org/0000-0002-5728-6869"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziyue Xu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102852563","display_name":"Yufan He","orcid":"https://orcid.org/0000-0003-4095-9104"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yufan He","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058176319","display_name":"Stephanie A. Harmon","orcid":"https://orcid.org/0000-0002-2507-2399"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stephanie Harmon","raw_affiliation_strings":["NIH"],"affiliations":[{"raw_affiliation_string":"NIH","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072480361","display_name":"Benjamin Simon","orcid":"https://orcid.org/0000-0002-8658-9711"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Benjamin Simon","raw_affiliation_strings":["NIH"],"affiliations":[{"raw_affiliation_string":"NIH","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030133314","display_name":"Greg Heinrich","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Greg Heinrich","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001892061","display_name":"Stephen Aylward","orcid":"https://orcid.org/0000-0002-7862-8856"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stephen Aylward","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066969226","display_name":"Marc Edgar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marc Edgar","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063626402","display_name":"Michael Zephyr","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Michael Zephyr","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066945976","display_name":"Pavlo Molchanov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pavlo Molchanov","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022455023","display_name":"Bar\u0131\u015f T\u00fcrkbey","orcid":"https://orcid.org/0000-0003-0853-6494"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baris Turkbey","raw_affiliation_strings":["NIH"],"affiliations":[{"raw_affiliation_string":"NIH","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043710204","display_name":"Holger R. Roth","orcid":"https://orcid.org/0000-0002-3662-8743"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Holger Roth","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052339079","display_name":"Daguang Xu","orcid":"https://orcid.org/0000-0002-4621-881X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daguang Xu","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":24,"corresponding_author_ids":["https://openalex.org/A5042877366"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":13.8558,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.98971611,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"14788","last_page":"14798"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9484000205993652,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9484000205993652,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12576","display_name":"vaccines and immunoinformatics approaches","score":0.9476000070571899,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7347777485847473},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5109909772872925},{"id":"https://openalex.org/keywords/medical-knowledge","display_name":"Medical knowledge","score":0.4887849986553192},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48287108540534973},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.08921778202056885},{"id":"https://openalex.org/keywords/medical-education","display_name":"Medical education","score":0.07847079634666443}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7347777485847473},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5109909772872925},{"id":"https://openalex.org/C2985722590","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medical knowledge","level":2,"score":0.4887849986553192},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48287108540534973},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.08921778202056885},{"id":"https://openalex.org/C509550671","wikidata":"https://www.wikidata.org/wiki/Q126945","display_name":"Medical education","level":1,"score":0.07847079634666443}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvpr52734.2025.01378","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52734.2025.01378","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Generalist":[0],"vision":[1,96],"language":[2],"models":[3,29,86,147,263],"(VLMs)":[4],"have":[5],"made":[6],"significant":[7],"strides":[8],"in":[9,16,53,87,93,236,277],"computer":[10],"vision,":[11],"but":[12],"they":[13,155],"fall":[14],"short":[15],"specialized":[17,129],"fields":[18],"like":[19,30],"healthcare,":[20],"where":[21],"expert":[22,63,143,146,207,228],"knowledge":[23,46,205,230],"is":[24,131,231],"essential.":[25],"Current":[26],"large":[27,83],"multimodal":[28],"Gemini":[31],"and":[32,70,100,114,138,167,172,223,260],"GPT-4o":[33],"are":[34,90,152,156,182,216],"insufficient":[35],"for":[36,122,149,159,186,199,219,233,282],"medical":[37,57,123,136,150,179,200,283],"tasks":[38],"due":[39],"to":[40,164,189],"their":[41,68],"reliance":[42],"on":[43,73,135,265],"memorized":[44],"internet":[45],"rather":[47],"than":[48],"the":[49,255,266,272],"nuanced":[50],"expertise":[51,276],"required":[52],"healthcare.":[54,237],"Meanwhile,":[55],"existing":[56],"VLMs":[58,89,201,281],"(e.g.":[59],"Med-Gemini)":[60],"often":[61,183],"lack":[62],"consultation":[64],"as":[65],"part":[66],"of":[67,112,128,178,226,252,274],"design,":[69],"many":[71],"rely":[72],"outdated,":[74],"static":[75],"datasets":[76],"that":[77,121,181,202,211],"were":[78],"not":[79,217],"created":[80],"with":[81,248],"modern,":[82],"deep":[84],"learning":[85],"mind.":[88],"usually":[91],"trained":[92,158,264],"three":[94],"stages:":[95],"pre-training,":[97,99],"vision-language":[98],"instruction":[101],"fine-tuning":[102],"(IFT).":[103],"IFT":[104,130],"has":[105],"been":[106],"typically":[107],"applied":[108],"using":[109],"a":[110,125,187,195],"mixture":[111],"generic":[113,212],"healthcare":[115],"data.":[116],"In":[117],"contrast,":[118],"we":[119,241],"propose":[120],"VLMs,":[124],"fourth":[126],"stage":[127],"necessary,":[132],"which":[133,174],"focuses":[134],"data":[137],"includes":[139],"information":[140],"from":[141],"domain":[142,204,275],"models.":[144,208],"Domain":[145],"developed":[148],"use":[151],"crucial":[153],"because":[154],"specifically":[157],"certain":[160],"clinical":[161,221],"tasks,":[162],"e.g.":[163],"detect":[165],"tumors":[166],"classify":[168],"abnormalities":[169],"through":[170],"segmentation":[171],"classification,":[173],"learn":[175],"fine-grained":[176],"features":[177],"data\u2212features":[180],"too":[184],"intricate":[185],"VLM":[188,213],"capture":[190],"effectively.":[191],"This":[192],"paper":[193],"introduces":[194],"new":[196],"framework,":[197],"VILA-M3,":[198],"utilizes":[203],"via":[206],"We":[209],"argue":[210],"architectures":[214],"alone":[215],"viable":[218],"real-world":[220],"applications":[222],"on-demand":[224],"usage":[225],"domain-specialized":[227],"model":[229,258],"critical":[232],"advancing":[234],"AI":[235],"Through":[238],"our":[239],"experiments,":[240],"show":[242],"an":[243,249],"improved":[244],"state-of-the-art":[245],"(SOTA)":[246],"performance":[247],"average":[250],"improvement":[251],"~9%":[253],"over":[254,262],"prior":[256],"SOTA":[257],"Med-Gemini":[259],"~6%":[261],"specific":[267],"tasks.":[268],"Our":[269],"approach":[270],"emphasizes":[271],"importance":[273],"creating":[278],"precise,":[279],"reliable":[280],"applications.":[284]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":9}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
