{"id":"https://openalex.org/W4391124424","doi":"https://doi.org/10.48550/arxiv.2401.10727","title":"MLLM-Tool: A Multimodal Large Language Model For Tool Agent Learning","display_name":"MLLM-Tool: A Multimodal Large Language Model For Tool Agent Learning","publication_year":2024,"publication_date":"2024-01-19","ids":{"openalex":"https://openalex.org/W4391124424","doi":"https://doi.org/10.48550/arxiv.2401.10727"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2401.10727","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.10727","pdf_url":"https://arxiv.org/pdf/2401.10727","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2401.10727","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120779465","display_name":"Chenyu Wang","orcid":"https://orcid.org/0009-0002-8648-460X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Chenyu","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013605789","display_name":"Weixin Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Weixin","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015681671","display_name":"Qianyu Chen","orcid":"https://orcid.org/0000-0002-1273-7988"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Sixun","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093681693","display_name":"Haonan Mai","orcid":"https://orcid.org/0009-0002-7861-4072"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xuan, Xiaohua","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109676538","display_name":"Jindi Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhengxin","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018400002","display_name":"Sixun Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Lin","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5109007234","display_name":"Xiaohua","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Shenghua","raw_affiliation_strings":["Michael"],"affiliations":[{"raw_affiliation_string":"Michael","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5120779465"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9685999751091003,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7192314863204956},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.6635969877243042},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.556918740272522},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5487440824508667},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43407565355300903},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.42539146542549133},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4170718193054199},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3983824551105499},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3714721202850342},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3392256200313568},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.15560993552207947},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.131597101688385}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7192314863204956},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.6635969877243042},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.556918740272522},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5487440824508667},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43407565355300903},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.42539146542549133},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4170718193054199},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3983824551105499},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3714721202850342},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3392256200313568},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.15560993552207947},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.131597101688385},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2401.10727","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.10727","pdf_url":"https://arxiv.org/pdf/2401.10727","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2401.10727","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2401.10727","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2401.10727","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.10727","pdf_url":"https://arxiv.org/pdf/2401.10727","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8100000023841858,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4391124424.pdf","grobid_xml":"https://content.openalex.org/works/W4391124424.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2353179089","https://openalex.org/W2923538289","https://openalex.org/W2353125546","https://openalex.org/W2470643824","https://openalex.org/W2349635380","https://openalex.org/W4353089801","https://openalex.org/W2353819554","https://openalex.org/W2359488321","https://openalex.org/W1516679419","https://openalex.org/W190396239"],"abstract_inverted_index":{"Recently,":[0],"the":[1,35,42,46,68,80,83,105,118,124,127,155,160,174],"astonishing":[2],"performance":[3],"of":[4,18,20,111,126,143,162,185],"large":[5],"language":[6,11],"models":[7],"(LLMs)":[8],"in":[9,64,66,82,89],"natural":[10],"comprehension":[12],"and":[13,100,115,165,193],"generation":[14],"tasks":[15],"triggered":[16],"lots":[17],"exploration":[19],"using":[21],"them":[22],"as":[23],"central":[24],"controllers":[25],"to":[26,37,40,50,56,75,159],"build":[27],"agent":[28],"systems.":[29],"Multiple":[30],"studies":[31],"focus":[32],"on":[33],"bridging":[34],"LLMs":[36,72,99,107],"external":[38],"tools":[39,137,188],"extend":[41],"application":[43],"scenarios.":[44],"However,":[45],"current":[47],"LLMs'":[48],"ability":[49],"perceive":[51],"tool":[52,120],"use":[53],"is":[54,146,183],"limited":[55],"a":[57,95,132],"single":[58],"text":[59],"query,":[60],"which":[61,168],"may":[62],"result":[63],"ambiguity":[65],"understanding":[67],"users'":[69],"real":[70],"intentions.":[71],"are":[73,195],"expected":[74],"eliminate":[76],"that":[77,104,147,180],"by":[78],"perceiving":[79],"information":[81],"visual-":[84],"or":[85],"auditory-grounded":[86],"instructions.":[87,191],"Therefore,":[88],"this":[90],"paper,":[91],"we":[92,130],"propose":[93],"MLLM-Tool,":[94],"system":[96],"incorporating":[97],"open-source":[98],"multi-modal":[101,112,135,190],"encoders":[102],"so":[103],"learned":[106],"can":[108],"be":[109],"conscious":[110],"input":[113,136],"instruction":[114,157],"then":[116],"select":[117],"function-matched":[119],"correctly.":[121],"To":[122],"facilitate":[123],"evaluation":[125],"model's":[128],"capability,":[129],"collect":[131],"dataset":[133,145],"featuring":[134],"from":[138],"HuggingFace.":[139],"Another":[140],"essential":[141],"feature":[142],"our":[144,181],"it":[148],"also":[149],"contains":[150],"multiple":[151],"potential":[152,171],"choices":[153],"for":[154,173,189],"same":[156,175],"due":[158],"existence":[161],"identical":[163],"functions":[164],"synonymous":[166],"functions,":[167],"provides":[169],"more":[170],"solutions":[172],"query.":[176],"The":[177],"experiments":[178],"reveal":[179],"MLLM-Tool":[182],"capable":[184],"recommending":[186],"appropriate":[187],"Codes":[192],"data":[194],"available":[196],"at":[197],"https://github.com/MLLM-Tool/MLLM-Tool.":[198]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
