{"id":"https://openalex.org/W4409264021","doi":"https://doi.org/10.1109/wacv61041.2025.00650","title":"MLLM-Tool: A Multimodal Large Language Model for Tool Agent Learning","display_name":"MLLM-Tool: A Multimodal Large Language Model for Tool Agent Learning","publication_year":2025,"publication_date":"2025-02-26","ids":{"openalex":"https://openalex.org/W4409264021","doi":"https://doi.org/10.1109/wacv61041.2025.00650"},"language":"en","primary_location":{"id":"doi:10.1109/wacv61041.2025.00650","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61041.2025.00650","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100385882","display_name":"Chenyu Wang","orcid":"https://orcid.org/0000-0003-0366-8113"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chenyu Wang","raw_affiliation_strings":["ShanghaiTech University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ShanghaiTech University","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100532338","display_name":"Weixin Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weixin Luo","raw_affiliation_strings":["Meituan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meituan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018400002","display_name":"Sixun Dong","orcid":null},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sixun Dong","raw_affiliation_strings":["ShanghaiTech University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ShanghaiTech University","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087118038","display_name":"Xiaohua Xuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaohua Xuan","raw_affiliation_strings":["UniDT Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UniDT Technology","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103153352","display_name":"Zhengxin Li","orcid":"https://orcid.org/0009-0002-8988-4545"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhengxin Li","raw_affiliation_strings":["ShanghaiTech University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ShanghaiTech University","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017116858","display_name":"Lin Ma","orcid":"https://orcid.org/0000-0002-7331-6132"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin Ma","raw_affiliation_strings":["Meituan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meituan","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034339267","display_name":"Shenghua Gao","orcid":"https://orcid.org/0000-0003-1626-2040"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Shenghua Gao","raw_affiliation_strings":["University of Hong Kong"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100385882"],"corresponding_institution_ids":["https://openalex.org/I30809798"],"apc_list":null,"apc_paid":null,"fwci":14.666,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.98625163,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"6678","last_page":"6687"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9588000178337097,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10456","display_name":"Multi-Agent Systems and Negotiation","score":0.9588000178337097,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9520000219345093,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9344000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7743704915046692},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4276901185512543},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.41918838024139404},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4067743122577667}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7743704915046692},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4276901185512543},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.41918838024139404},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4067743122577667}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/wacv61041.2025.00650","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61041.2025.00650","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1670174335","display_name":null,"funder_award_id":"61932020,62172279","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W4285294723","https://openalex.org/W4308760226","https://openalex.org/W4386071707","https://openalex.org/W4389519587","https://openalex.org/W4390872747","https://openalex.org/W4390873054","https://openalex.org/W6778883912","https://openalex.org/W6796581206","https://openalex.org/W6809583738","https://openalex.org/W6810738896","https://openalex.org/W6849177959","https://openalex.org/W6849732303","https://openalex.org/W6850071225","https://openalex.org/W6850625674","https://openalex.org/W6851513886","https://openalex.org/W6851592950","https://openalex.org/W6851950068","https://openalex.org/W6852447913","https://openalex.org/W6852792400","https://openalex.org/W6852797390","https://openalex.org/W6853194520","https://openalex.org/W6853966477","https://openalex.org/W6854866820","https://openalex.org/W6855173278","https://openalex.org/W6855330075","https://openalex.org/W6855388516","https://openalex.org/W6856250771","https://openalex.org/W6856794988","https://openalex.org/W6860041859"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Recently,":[0],"the":[1,35,42,46,68,80,83,104,117,123,126,154,159,173],"astonishing":[2],"performance":[3],"of":[4,18,20,110,125,142,161,184],"large":[5],"language":[6,11],"models":[7],"(LLMs)":[8],"in":[9,64,66,82,88],"natural":[10],"comprehension":[12],"and":[13,99,114,164,192],"generation":[14],"tasks":[15],"triggered":[16],"lots":[17],"exploration":[19],"using":[21],"them":[22],"as":[23],"central":[24],"controllers":[25],"to":[26,37,40,50,56,75,158],"build":[27],"agent":[28],"systems.":[29],"Multiple":[30],"studies":[31],"focus":[32],"on":[33],"bridging":[34],"LLMs":[36,72,98,106],"external":[38],"tools":[39,136,187],"extend":[41],"application":[43],"scenarios.":[44],"However,":[45],"current":[47],"LLMs'":[48],"ability":[49],"perceive":[51],"tool":[52,119],"use":[53],"is":[54,145,182],"limited":[55],"a":[57,94,131],"single":[58],"text":[59],"query,":[60],"which":[61,167],"may":[62],"result":[63],"ambiguity":[65],"understanding":[67],"users'":[69],"real":[70],"intentions.":[71],"are":[73,194],"expected":[74],"eliminate":[76],"that":[77,103,146,179],"by":[78],"perceiving":[79],"information":[81],"visual-or":[84],"auditory-grounded":[85],"instructions.":[86,190],"Therefore,":[87],"this":[89],"paper,":[90],"we":[91,129],"propose":[92],"MLLM-Tool,":[93],"system":[95],"incorporating":[96],"open-source":[97],"multi-modal":[100,111,134,189],"encoders":[101],"so":[102],"learned":[105],"can":[107],"be":[108],"conscious":[109],"input":[112,135],"instruction":[113,156],"then":[115],"select":[116],"function-matched":[118],"correctly.":[120],"To":[121],"facilitate":[122],"evaluation":[124],"model's":[127],"capability,":[128],"collect":[130],"dataset":[132,144],"featuring":[133],"from":[137],"HuggingFace.":[138],"Another":[139],"essential":[140],"feature":[141],"our":[143,180],"it":[147],"also":[148],"contains":[149],"multiple":[150],"potential":[151,170],"choices":[152],"for":[153,172,188],"same":[155,174],"due":[157],"existence":[160],"identical":[162],"functions":[163],"synonymous":[165],"functions,":[166],"provides":[168],"more":[169],"solutions":[171],"query.":[175],"The":[176],"experiments":[177],"reveal":[178],"MLLM-Tool":[181],"capable":[183],"recommending":[185],"appropriate":[186],"Codes":[191],"data":[193],"available":[195],"at":[196],"github.com/MLLM-Tool/MLLM-Tool.":[197]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":3}],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-10-10T00:00:00"}
