{"id":"https://openalex.org/W7133803567","doi":"https://doi.org/10.1016/j.array.2026.100739","title":"A systematic review of vision language models: Comprehensive analysis of architectures, applications, datasets and challenges towards robust multimodal intelligence","display_name":"A systematic review of vision language models: Comprehensive analysis of architectures, applications, datasets and challenges towards robust multimodal intelligence","publication_year":2026,"publication_date":"2026-03-05","ids":{"openalex":"https://openalex.org/W7133803567","doi":"https://doi.org/10.1016/j.array.2026.100739"},"language":"en","primary_location":{"id":"doi:10.1016/j.array.2026.100739","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.array.2026.100739","pdf_url":null,"source":{"id":"https://openalex.org/S4210194039","display_name":"Array","issn_l":"2590-0056","issn":["2590-0056"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Array","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1016/j.array.2026.100739","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100715081","display_name":"Arifur Rahman","orcid":"https://orcid.org/0009-0004-6718-8869"},"institutions":[{"id":"https://openalex.org/I1307585291","display_name":"Khulna University of Engineering and Technology","ror":"https://ror.org/04y58d606","country_code":"BD","type":"education","lineage":["https://openalex.org/I1307585291"]}],"countries":["BD"],"is_corresponding":true,"raw_author_name":"Arifur Rahman","raw_affiliation_strings":["Department of Computer Science and Engineering, Khulna University of Engineering and Technology, Khulna 9203, Bangladesh"],"raw_orcid":"https://orcid.org/0009-0004-6718-8869","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Khulna University of Engineering and Technology, Khulna 9203, Bangladesh","institution_ids":["https://openalex.org/I1307585291"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5100715081"],"corresponding_institution_ids":["https://openalex.org/I1307585291"],"apc_list":{"value":1350,"currency":"USD","value_usd":1350},"apc_paid":{"value":1350,"currency":"USD","value_usd":1350},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.44361455,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"30","issue":null,"first_page":"100739","last_page":"100739"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9150000214576721,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9150000214576721,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.0071000000461936,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.0052999998442828655,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.2856000065803528},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.258899986743927},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.2554999887943268},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.25119999051094055},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.24779999256134033}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6459000110626221},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4803999960422516},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28209999203681946},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2554999887943268},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.25119999051094055},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.24779999256134033},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.24289999902248383},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.24269999563694}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1016/j.array.2026.100739","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.array.2026.100739","pdf_url":null,"source":{"id":"https://openalex.org/S4210194039","display_name":"Array","issn_l":"2590-0056","issn":["2590-0056"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Array","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1016/j.array.2026.100739","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.array.2026.100739","pdf_url":null,"source":{"id":"https://openalex.org/S4210194039","display_name":"Array","issn_l":"2590-0056","issn":["2590-0056"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Array","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":69,"referenced_works":["https://openalex.org/W2560730294","https://openalex.org/W2561715562","https://openalex.org/W2963518342","https://openalex.org/W2963644680","https://openalex.org/W2984008963","https://openalex.org/W3035524453","https://openalex.org/W3161801106","https://openalex.org/W3164027444","https://openalex.org/W3172942063","https://openalex.org/W3173220247","https://openalex.org/W3198377975","https://openalex.org/W4312261477","https://openalex.org/W4312310776","https://openalex.org/W4312420092","https://openalex.org/W4312747482","https://openalex.org/W4312784228","https://openalex.org/W4312804044","https://openalex.org/W4312935996","https://openalex.org/W4312956471","https://openalex.org/W4312980231","https://openalex.org/W4313156423","https://openalex.org/W4385570412","https://openalex.org/W4385804789","https://openalex.org/W4386065350","https://openalex.org/W4386071707","https://openalex.org/W4386076483","https://openalex.org/W4386790226","https://openalex.org/W4391654261","https://openalex.org/W4401042569","https://openalex.org/W4402702928","https://openalex.org/W4402713111","https://openalex.org/W4402716070","https://openalex.org/W4402716264","https://openalex.org/W4402716288","https://openalex.org/W4402727451","https://openalex.org/W4402775923","https://openalex.org/W4402776460","https://openalex.org/W4403337227","https://openalex.org/W4404509535","https://openalex.org/W4405382875","https://openalex.org/W4405673884","https://openalex.org/W4407152340","https://openalex.org/W4407743259","https://openalex.org/W4407874080","https://openalex.org/W4408564248","https://openalex.org/W4409383105","https://openalex.org/W4410032048","https://openalex.org/W4410152950","https://openalex.org/W4410636363","https://openalex.org/W4413144616","https://openalex.org/W4413147106","https://openalex.org/W4413147456","https://openalex.org/W4413147465","https://openalex.org/W4413147863","https://openalex.org/W4413156213","https://openalex.org/W4413947048","https://openalex.org/W4414099389","https://openalex.org/W4414197420","https://openalex.org/W4415798457","https://openalex.org/W4415798703","https://openalex.org/W4415800385","https://openalex.org/W4416037023","https://openalex.org/W7117320526","https://openalex.org/W7124471359","https://openalex.org/W7131100025","https://openalex.org/W7133185086","https://openalex.org/W7133188694","https://openalex.org/W7133224204","https://openalex.org/W7133253098"],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"have":[2],"transformed":[3],"multimodal":[4,213,257,336,360],"artificial":[5],"intelligence,":[6],"yet":[7],"a":[8,56,144,173,219,300,350],"comprehensive":[9],"synthesis":[10,348],"of":[11,59,225],"their":[12,85],"architectural":[13,81,240,275],"evolution,":[14],"training":[15,141],"paradigms,":[16],"and":[17,74,113,132,156,184,211,234,243,264,292,306,320,330,338,345,357],"domain-specific":[18],"capabilities":[19],"remains":[20],"limited.":[21],"This":[22,215,347],"systematic":[23],"review,":[24],"conducted":[25],"according":[26],"to":[27,35,122,227,239,271,278,323],"PRISMA":[28],"guidelines,":[29],"analyzes":[30],"research":[31,316],"from":[32],"January":[33],"2021":[34],"December":[36],"2025.":[37],"From":[38],"928":[39],"identified":[40],"records":[41],"across":[42],"seven":[43],"digital":[44],"libraries,":[45],"48":[46],"articles":[47],"were":[48],"retained":[49],"for":[50,100,171,182,198,222,343,353],"final":[51],"synthesis.":[52,76],"The":[53,137,200,310],"review":[54,311],"establishes":[55],"unified":[57,103],"taxonomy":[58],"VLM":[60],"architectures,":[61],"categorizing":[62],"them":[63],"by":[64,313],"core":[65],"functional":[66],"objectives":[67],"including":[68],"vision-language":[69],"understanding,":[70],"vision-conditioned":[71],"text":[72],"generation,":[73],"multimodal-to-multimodal":[75],"These":[77],"are":[78],"organized":[79],"alongside":[80],"families":[82],"defined":[83],"through":[84],"coupling":[86],"mechanisms:":[87],"dual-encoder":[88],"models":[89,105,125],"optimized":[90],"via":[91,126,135,159],"symmetric":[92],"InfoNCE":[93],"contrastive":[94,149,180],"loss;":[95],"fusion-based":[96],"transformers":[97],"employing":[98],"cross-attention":[99],"fine-grained":[101],"grounding;":[102,332],"single-stream":[104],"using":[106],"prefix":[107],"language":[108,124,152],"modeling":[109],"over":[110],"visual":[111],"tokens;":[112],"modular":[114],"bridge":[115],"systems":[116],"that":[117],"connect":[118],"pretrained":[119],"vision":[120],"encoders":[121],"large":[123],"query-based":[127],"adapters":[128],"such":[129],"as":[130,206],"Q-Former":[131],"parameter-efficient":[133],"tuning":[134],"LoRA.":[136],"study":[138,201,248],"consolidates":[139],"disparate":[140],"approaches":[142],"into":[143],"multi-objective":[145],"integration":[146],"framework,":[147,169],"combining":[148],"alignment,":[150],"masked":[151],"or":[153],"image":[154],"modeling,":[155],"reinforcement-based":[157],"alignment":[158],"Group":[160],"Relative":[161],"Policy":[162],"Optimization":[163],"(GRPO).":[164],"Ablation":[165],"studies":[166,296],"validate":[167],"this":[168,247],"showing,":[170],"instance,":[172],"31.7%":[174],"accuracy":[175,191],"drop":[176],"on":[177,192,230],"ScienceQA":[178],"without":[179],"pretraining":[181],"LLaVA-1.5":[183],"an":[185],"18.2%":[186],"decrease":[187],"in":[188,287],"clinical":[189],"report":[190],"MIMIC-CXR":[193],"when":[194],"GRPO":[195],"is":[196],"disabled":[197],"MedVLM-R1.":[199],"formalizes":[202],"the":[203,207,325],"compositionality":[204,326],"gap":[205,302],"KL-divergence":[208],"between":[209,303],"joint":[210],"factorized":[212],"representations.":[214],"diagnostic":[216],"metric":[217],"provides":[218,349],"mathematical":[220],"explanation":[221],"performance":[223,305],"deficits":[224],"40":[226],"65%":[228],"observed":[229],"benchmarks":[231],"like":[232],"GQA":[233],"Winoground,":[235],"linking":[236],"these":[237,269],"failures":[238],"fusion":[241],"bottlenecks":[242],"dataset":[244],"biases.":[245],"Besides,":[246],"also":[249],"explores":[250],"diverse":[251],"vertical":[252],"applications,":[253],"specifically":[254],"targeting":[255],"standard":[256],"interfaces,":[258],"medical":[259],"image-to-text":[260],"reasoning,":[261],"geospatial":[262],"surveillance,":[263],"VLA":[265],"robotics.":[266],"We":[267],"analyzed":[268],"sectors":[270],"determine":[272],"how":[273],"specific":[274],"configurations":[276],"adapt":[277],"specialized":[279],"data":[280],"constraints.":[281],"Evaluation":[282],"consistently":[283],"reveals":[284],"critical":[285],"limitations":[286],"robustness,":[288],"interpretability,":[289],"hallucination":[290],"control,":[291],"out-of-domain":[293],"generalization.":[294],"Most":[295],"remain":[297],"lab-based,":[298],"highlighting":[299],"significant":[301],"benchmark":[304],"real-world,":[307],"safety-critical":[308],"deployment.":[309],"concludes":[312],"charting":[314],"essential":[315],"directions:":[317],"advancing":[318],"neuro-symbolic":[319],"mixture-of-experts":[321],"architectures":[322],"close":[324],"gap;":[327],"developing":[328,354],"spatiotemporal":[329],"multilingual":[331],"implementing":[333],"privacy-aware":[334],"federated":[335],"learning;":[337],"creating":[339],"rigorous":[340],"evaluation":[341],"protocols":[342],"consistency":[344],"safety.":[346],"foundational":[351],"reference":[352],"robust,":[355],"adaptable,":[356],"trustworthy":[358],"next-generation":[359],"systems.":[361]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-06T00:00:00"}
