{"id":"https://openalex.org/W4416729942","doi":"https://doi.org/10.1007/s44267-025-00093-y","title":"Large multimodal agents: a survey","display_name":"Large multimodal agents: a survey","publication_year":2025,"publication_date":"2025-11-26","ids":{"openalex":"https://openalex.org/W4416729942","doi":"https://doi.org/10.1007/s44267-025-00093-y"},"language":"en","primary_location":{"id":"doi:10.1007/s44267-025-00093-y","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s44267-025-00093-y","pdf_url":"https://link.springer.com/content/pdf/10.1007/s44267-025-00093-y.pdf","source":{"id":"https://openalex.org/S4387289164","display_name":"Visual Intelligence","issn_l":"2731-9008","issn":["2731-9008"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Visual Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://link.springer.com/content/pdf/10.1007/s44267-025-00093-y.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111603816","display_name":"Junlin Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junlin Xie","raw_affiliation_strings":["Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Guangdong, 518172, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Guangdong, 518172, China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100430139","display_name":"Zhihong Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhihong Chen","raw_affiliation_strings":["Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Guangdong, 518172, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Guangdong, 518172, China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103228070","display_name":"Rui Zhang","orcid":"https://orcid.org/0000-0002-3045-9969"},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruifei Zhang","raw_affiliation_strings":["Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Guangdong, 518172, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Guangdong, 518172, China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5042965510","display_name":"Guanbin Li","orcid":"https://orcid.org/0000-0002-4805-0926"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Guanbin Li","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, 510275, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, 510275, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5042965510"],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":22.2151,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":{"value":0.99299591,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"3","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33070001006126404,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33070001006126404,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.25929999351501465,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.0820000022649765,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5548999905586243},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.4447999894618988},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.4041999876499176},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.382999986410141},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.3522000014781952},{"id":"https://openalex.org/keywords/conceptual-framework","display_name":"Conceptual framework","score":0.32659998536109924}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6664999723434448},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5548999905586243},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4781999886035919},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.4447999894618988},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.4041999876499176},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.382999986410141},{"id":"https://openalex.org/C539667460","wikidata":"https://www.wikidata.org/wiki/Q2414942","display_name":"Management science","level":1,"score":0.37720000743865967},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3522000014781952},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3328999876976013},{"id":"https://openalex.org/C14224292","wikidata":"https://www.wikidata.org/wiki/Q13600188","display_name":"Conceptual framework","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C189708586","wikidata":"https://www.wikidata.org/wiki/Q1504425","display_name":"Systematic review","level":3,"score":0.3239000141620636},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3165999948978424},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.3028999865055084},{"id":"https://openalex.org/C2778012447","wikidata":"https://www.wikidata.org/wiki/Q1034415","display_name":"Scope (computer science)","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C2778464652","wikidata":"https://www.wikidata.org/wiki/Q309849","display_name":"Open research","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.2563000023365021},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1007/s44267-025-00093-y","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s44267-025-00093-y","pdf_url":"https://link.springer.com/content/pdf/10.1007/s44267-025-00093-y.pdf","source":{"id":"https://openalex.org/S4387289164","display_name":"Visual Intelligence","issn_l":"2731-9008","issn":["2731-9008"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Visual Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:4baa8dce3d8c4758b76001b3d173fbcd","is_oa":true,"landing_page_url":"https://doaj.org/article/4baa8dce3d8c4758b76001b3d173fbcd","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Visual Intelligence, Vol 3, Iss 1, Pp 1-16 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1007/s44267-025-00093-y","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s44267-025-00093-y","pdf_url":"https://link.springer.com/content/pdf/10.1007/s44267-025-00093-y.pdf","source":{"id":"https://openalex.org/S4387289164","display_name":"Visual Intelligence","issn_l":"2731-9008","issn":["2731-9008"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Visual Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G391238517","display_name":null,"funder_award_id":", and","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4751043524","display_name":null,"funder_award_id":"62322608","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320316083","display_name":"Tencent","ror":"https://ror.org/00hhjss72"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4416729942.pdf"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W2052791563","https://openalex.org/W2124344619","https://openalex.org/W3030037806","https://openalex.org/W4389520806","https://openalex.org/W4389523849","https://openalex.org/W4389781562","https://openalex.org/W4393065402","https://openalex.org/W4400382213","https://openalex.org/W4401043345","https://openalex.org/W4402727710","https://openalex.org/W4405014122","https://openalex.org/W4405056274","https://openalex.org/W4406800520","https://openalex.org/W4411119774","https://openalex.org/W4411458549","https://openalex.org/W4412886962","https://openalex.org/W4412944561"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"Large":[1],"language":[2],"models":[3],"(LLMs)":[4],"have":[5],"achieved":[6],"superior":[7],"performance":[8],"in":[9,98,133,204],"powering":[10],"text-based":[11],"AI":[12,40,54],"agents,":[13,79],"endowing":[14],"them":[15],"with":[16,121],"decision-making":[17],"and":[18,51,65,100,158,186,199],"reasoning":[19],"abilities":[20],"that":[21,117],"are":[22],"analogous":[23],"to":[24,56,83,163,170,195],"those":[25],"exhibited":[26],"by":[27],"humans.":[28],"Concurrently,":[29],"an":[30],"emerging":[31],"research":[32,106,190,203],"trend":[33],"is":[34,136],"focused":[35],"on":[36],"extending":[37],"these":[38,155],"LLM-powered":[39],"agents":[41,55,87],"into":[42,107],"the":[43,49,94,102,114,122,130,137,165,182],"multimodal":[44,58,78,86],"domain.":[45],"This":[46,167],"extension":[47],"facilitates":[48],"interpretation":[50],"response":[52],"of":[53,76,105,124,129,185],"diverse":[57,138],"user":[59],"queries,":[60],"thereby":[61],"handling":[62],"more":[63,174],"intricate":[64],"nuanced":[66],"tasks.":[67],"In":[68],"this":[69,134,205],"paper,":[70],"we":[71,81,92,112,153,180],"conduct":[72],"a":[73,160],"systematic":[74],"review":[75,113],"LLM-driven":[77],"which":[80,145],"refer":[82],"as":[84],"large":[85],"(":[88],"for":[89,201],"short).":[90],"First,":[91],"introduce":[93],"essential":[95],"components":[96],"involved":[97],"developing":[99],"categorize":[101],"current":[103],"body":[104],"four":[108],"distinct":[109],"types.":[110],"Subsequently,":[111],"collaborative":[115],"frameworks":[116],"integrate":[118],"multiple":[119],",":[120],"aim":[123],"enhancing":[125],"collective":[126],"efficacy.":[127],"One":[128],"critical":[131],"challenges":[132],"field":[135],"evaluation":[139,156],"methods":[140],"used":[141],"across":[142],"existing":[143],"studies,":[144],"impedes":[146],"effective":[147],"comparison":[148],"among":[149],"different":[150],".":[151],"Therefore,":[152],"compile":[154],"methodologies":[157],"establish":[159],"comprehensive":[161],"framework":[162,168],"bridge":[164],"gaps.":[166],"aims":[169,194],"standardize":[171],"evaluations,":[172],"facilitating":[173],"meaningful":[175],"comparisons.":[176],"Concluding":[177],"our":[178],"review,":[179],"highlight":[181],"extensive":[183],"applications":[184],"propose":[187],"potential":[188],"future":[189,202],"directions.":[191],"Our":[192],"discussion":[193],"provide":[196],"valuable":[197],"insights":[198],"guidelines":[200],"rapidly":[206],"evolving":[207],"field.":[208]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":6}],"updated_date":"2026-03-31T07:56:22.981413","created_date":"2025-11-27T00:00:00"}
