{"id":"https://openalex.org/W7117458516","doi":"https://doi.org/10.1109/dicta68720.2025.11302424","title":"Rethinking Agentic and End-to-End Large Multimodal Models for Vision Tasks","display_name":"Rethinking Agentic and End-to-End Large Multimodal Models for Vision Tasks","publication_year":2025,"publication_date":"2025-12-03","ids":{"openalex":"https://openalex.org/W7117458516","doi":"https://doi.org/10.1109/dicta68720.2025.11302424"},"language":null,"primary_location":{"id":"doi:10.1109/dicta68720.2025.11302424","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dicta68720.2025.11302424","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Digital Image Computing: Techniques and Applications (DICTA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121482641","display_name":"Yixin Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I5681781","display_name":"University of Adelaide","ror":"https://ror.org/00892tw58","country_code":"AU","type":"education","lineage":["https://openalex.org/I5681781"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Yixin Wang","raw_affiliation_strings":["The University of Adelaide,Adelaide,SA,Australia,5005"],"affiliations":[{"raw_affiliation_string":"The University of Adelaide,Adelaide,SA,Australia,5005","institution_ids":["https://openalex.org/I5681781"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5114310279","display_name":"Xinyu Wang","orcid":"https://orcid.org/0000-0002-1621-5511"},"institutions":[{"id":"https://openalex.org/I5681781","display_name":"University of Adelaide","ror":"https://ror.org/00892tw58","country_code":"AU","type":"education","lineage":["https://openalex.org/I5681781"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Xinyu Wang","raw_affiliation_strings":["The University of Adelaide,Adelaide,SA,Australia,5005"],"affiliations":[{"raw_affiliation_string":"The University of Adelaide,Adelaide,SA,Australia,5005","institution_ids":["https://openalex.org/I5681781"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5121482641"],"corresponding_institution_ids":["https://openalex.org/I5681781"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.65789185,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9829999804496765,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9829999804496765,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.00139999995008111,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5990999937057495},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.5855000019073486},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5501000285148621},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4291999936103821},{"id":"https://openalex.org/keywords/ranging","display_name":"Ranging","score":0.4146000146865845},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.3944999873638153}],"concepts":[{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.671999990940094},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6686000227928162},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6577000021934509},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5990999937057495},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.5855000019073486},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5501000285148621},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4291999936103821},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.4146000146865845},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3944999873638153},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.3382999897003174},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.28450000286102295},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2808000147342682},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2777999937534332},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.27469998598098755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dicta68720.2025.11302424","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dicta68720.2025.11302424","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Digital Image Computing: Techniques and Applications (DICTA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1982925187","https://openalex.org/W2185175083","https://openalex.org/W2251512949","https://openalex.org/W2463631526","https://openalex.org/W2519904008","https://openalex.org/W2560730294","https://openalex.org/W2585635281","https://openalex.org/W2962749469","https://openalex.org/W3015469128","https://openalex.org/W3035644209","https://openalex.org/W3170764266","https://openalex.org/W4226075195","https://openalex.org/W4304091583","https://openalex.org/W4312423884","https://openalex.org/W4386065691","https://openalex.org/W4390872747","https://openalex.org/W4402713111","https://openalex.org/W4403081466","https://openalex.org/W4404612908","https://openalex.org/W7103755574"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Multimodal":[1],"Models":[2],"(LMMs)":[3],"have":[4],"emerged":[5],"as":[6],"powerful":[7],"general-purpose":[8],"vision":[9,51],"systems,":[10],"yet":[11],"their":[12],"effectiveness":[13],"compared":[14],"to":[15,59,105],"specialized":[16,171],"tools":[17,172],"remains":[18],"unclear.":[19],"We":[20],"present":[21],"a":[22,70],"comprehensive":[23],"evaluation":[24],"of":[25,164,183,188],"state-of-the-art":[26],"LMMs":[27,48,74,184],"across":[28],"three":[29],"paradigms:":[30],"end-to-end":[31],"inference,":[32],"toolassisted":[33],"prompting,":[34],"and":[35,88,107,192],"autonomous":[36],"agent-based":[37],"tool":[38,125,146],"selection.":[39],"Through":[40],"our":[41],"benchmark":[42],"suite":[43],"MMToolSet,":[44],"we":[45],"assessed":[46],"leading":[47],"on":[49,116,151],"diverse":[50],"tasks":[52,79],"ranging":[53],"from":[54],"high-level":[55],"reasoning":[56],"(image-caption":[57],"matching)":[58],"precise":[60],"low-level":[61],"operations":[62],"(object":[63],"counting,":[64],"spatial":[65,86],"grounding).":[66],"Our":[67],"findings":[68],"reveal":[69],"nuanced":[71],"landscape:":[72],"while":[73,133],"excel":[75],"at":[76],"semantic":[77],"understanding":[78],"without":[80],"external":[81,128],"assistance,":[82],"they":[83],"struggle":[84],"with":[85,185],"precision":[87,187],"systematic":[89],"enumeration.":[90],"Tool":[91],"augmentation":[92],"yields":[93],"substantial":[94],"improvements":[95],"for":[96],"these":[97],"challenging":[98],"tasks,":[99],"reducing":[100],"counting":[101],"errors":[102],"by":[103],"up":[104],"50%":[106],"enabling":[108],"accurate":[109],"object":[110],"localization.":[111],"However,":[112],"success":[113],"depends":[114],"critically":[115],"model":[117,143],"sophistication:":[118],"advanced":[119],"models":[120],"like":[121],"ChatGPT-4o":[122],"demonstrate":[123],"selective":[124],"usage,":[126],"invoking":[127],"assistance":[129],"only":[130],"when":[131],"beneficial,":[132],"open-source":[134],"alternatives":[135],"often":[136],"misapply":[137],"tools,":[138],"degrading":[139],"performance.":[140],"Notably,":[141],"no":[142],"exhibits":[144],"dynamic":[145],"configuration":[147],"capabilities,":[148],"relying":[149],"instead":[150],"default":[152],"parameters":[153],"even":[154],"in":[155,169,174],"suboptimal":[156],"scenarios.":[157],"These":[158],"results":[159],"suggest":[160],"that":[161,178],"the":[162,181,186],"future":[163],"multimodal":[165],"AI":[166],"lies":[167],"not":[168],"replacing":[170],"but":[173],"developing":[175],"hybrid":[176],"systems":[177],"seamlessly":[179],"combine":[180],"generalization":[182],"task-specific":[189],"models.":[190],"Data":[191],"code":[193],"are":[194],"available":[195],"here":[196],"https://github.com/TLink666/MMToolSet.":[197]},"counts_by_year":[],"updated_date":"2025-12-30T23:08:21.542490","created_date":"2025-12-29T00:00:00"}
