{"id":"https://openalex.org/W7164857192","doi":"https://doi.org/10.1145/3805622.3819063","title":"From Captioning to Multimodal Reasoning: The Evolution of Vision-Language Research in the Era of Multimodal LLMs","display_name":"From Captioning to Multimodal Reasoning: The Evolution of Vision-Language Research in the Era of Multimodal LLMs","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164857192","doi":"https://doi.org/10.1145/3805622.3819063"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3819063","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3819063","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3819063","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066519737","display_name":"Marcella Cornia","orcid":"https://orcid.org/0000-0001-9640-9385"},"institutions":[{"id":"https://openalex.org/I122346577","display_name":"University of Modena and Reggio Emilia","ror":"https://ror.org/02d4c4y02","country_code":"IT","type":"education","lineage":["https://openalex.org/I122346577"]}],"countries":["IT"],"is_corresponding":true,"raw_author_name":"Marcella Cornia","raw_affiliation_strings":["University of Modena and Reggio Emilia, Reggio Emilia, Italy"],"raw_orcid":"https://orcid.org/0000-0001-9640-9385","affiliations":[{"raw_affiliation_string":"University of Modena and Reggio Emilia, Reggio Emilia, Italy","institution_ids":["https://openalex.org/I122346577"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5066519737"],"corresponding_institution_ids":["https://openalex.org/I122346577"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.9399058,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"1"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0005000000237487257,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9236999750137329},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5386999845504761},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5138999819755554},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.4562000036239624},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.4171000123023987},{"id":"https://openalex.org/keywords/paradigm-shift","display_name":"Paradigm shift","score":0.38830000162124634},{"id":"https://openalex.org/keywords/trustworthiness","display_name":"Trustworthiness","score":0.33149999380111694}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9236999750137329},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6315000057220459},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5386999845504761},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5138999819755554},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.4562000036239624},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.4171000123023987},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.39489999413490295},{"id":"https://openalex.org/C43540301","wikidata":"https://www.wikidata.org/wiki/Q689971","display_name":"Paradigm shift","level":2,"score":0.38830000162124634},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37959998846054077},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.34130001068115234},{"id":"https://openalex.org/C153701036","wikidata":"https://www.wikidata.org/wiki/Q659974","display_name":"Trustworthiness","level":2,"score":0.33149999380111694},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.32839998602867126},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.29739999771118164},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26989999413490295},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2605000138282776}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3819063","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3819063","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3819063","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3819063","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7310834527015686,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Over":[0],"the":[1,104,107,180,183,205,215],"past":[2],"decade,":[3],"vision-language":[4,218],"research":[5],"has":[6,65,101],"undergone":[7],"a":[8],"major":[9,206],"evolution.":[10],"Early":[11],"image":[12],"captioning":[13,222],"systems":[14,72,223],"primarily":[15],"focused":[16],"on":[17,29],"generating":[18],"concise":[19],"textual":[20],"descriptions":[21],"of":[22,55,106,188,217],"visual":[23,49,78,166],"content":[24],"through":[25],"task-specific":[26],"architectures":[27,111],"trained":[28],"relatively":[30],"constrained":[31,149],"datasets.":[32],"While":[33],"successful":[34],"in":[35,126,164],"narrow":[36],"settings,":[37],"these":[38,118],"approaches":[39],"provided":[40],"only":[41],"limited":[42],"reasoning":[43,90],"capabilities":[44,91],"and":[45,50,59,87,96,128,175,186,199,208,233],"shallow":[46],"alignment":[47],"between":[48],"linguistic":[51],"representations.":[52],"The":[53],"emergence":[54],"large-scale":[56],"multimodal":[57,71,85,114,170,189,235],"pretraining":[58],"Multimodal":[60],"Large":[61],"Language":[62],"Models":[63],"(MLLMs)":[64],"fundamentally":[66],"reshaped":[67],"this":[68],"landscape.":[69],"Modern":[70],"are":[73,147],"no":[74],"longer":[75],"restricted":[76],"to":[77,224],"description,":[79],"but":[80],"instead":[81],"support":[82],"open-ended":[83],"interaction,":[84],"generation,":[86],"increasingly":[88],"sophisticated":[89],"across":[92],"images,":[93],"text,":[94],"video,":[95],"diverse":[97],"modalities.":[98],"This":[99,201],"transition":[100],"progressively":[102],"shifted":[103],"focus":[105],"field":[108],"from":[109,220],"perception-oriented":[110],"toward":[112,230],"general-purpose":[113],"foundation":[115,190],"models.":[116],"Despite":[117],"advances,":[119],"current":[120],"MLLMs":[121],"still":[122],"exhibit":[123],"significant":[124],"limitations":[125,158],"knowledge-intensive":[127,165],"reasoning-heavy":[129],"scenarios.":[130],"In":[131],"particular,":[132],"tasks":[133],"requiring":[134],"external,":[135],"long-tail,":[136],"or":[137],"continuously":[138],"evolving":[139],"knowledge":[140,173],"remain":[141],"highly":[142],"challenging,":[143],"as":[144],"model":[145,155],"predictions":[146],"often":[148],"by":[150],"information":[151],"implicitly":[152],"encoded":[153],"within":[154],"parameters.":[156],"These":[157],"have":[159,213],"recently":[160],"motivated":[161],"growing":[162],"interest":[163],"question":[167],"answering,":[168],"retrieval-augmented":[169,225],"architectures,":[171],"adaptive":[172],"integration,":[174],"reasoning-aware":[176],"generation":[177],"strategies.":[178],"At":[179],"same":[181],"time,":[182],"increasing":[184],"scale":[185],"complexity":[187],"models":[191],"raise":[192],"important":[193],"challenges":[194],"concerning":[195],"hallucination":[196],"mitigation,":[197],"interpretability,":[198],"trustworthiness.":[200],"talk":[202],"will":[203],"discuss":[204],"conceptual":[207],"technical":[209],"paradigm":[210],"shifts":[211],"that":[212],"shaped":[214],"evolution":[216],"research,":[219],"classical":[221],"MLLMs,":[226],"highlighting":[227],"emerging":[228],"directions":[229],"knowledge-aware,":[231],"reasoning-centric,":[232],"trustworthy":[234],"AI":[236],"systems.":[237]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
