{"id":"https://openalex.org/W6888435638","doi":"https://doi.org/10.21227/pvmd-s489","title":"MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities","display_name":"MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities","publication_year":2024,"publication_date":"2024-12-21","ids":{"openalex":"https://openalex.org/W6888435638","doi":"https://doi.org/10.21227/pvmd-s489"},"language":"en","primary_location":{"id":"doi:10.21227/pvmd-s489","is_oa":true,"landing_page_url":"https://doi.org/10.21227/pvmd-s489","pdf_url":null,"source":{"id":"https://openalex.org/S7407051695","display_name":"IEEE DataPort","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"dataset"},"type":"dataset","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.21227/pvmd-s489","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yu, Weihao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Weihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Zhengyuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhengyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ren, Lingfeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ren, Lingfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Linjie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Linjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Jianfeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jianfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Kevin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Kevin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Chung-Ching","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Chung-Ching","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Zicheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Lijuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Lijuan Wang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Wang, Xinchao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinchao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":null,"topics":[],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8460999727249146},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.4918000102043152},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.4867999851703644},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3959999978542328},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3603000044822693},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.3061000108718872}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8460999727249146},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7071999907493591},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6258000135421753},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5530999898910522},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.4918000102043152},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.4867999851703644},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3959999978542328},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3603000044822693},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25940001010894775},{"id":"https://openalex.org/C2780366209","wikidata":"https://www.wikidata.org/wiki/Q5170200","display_name":"Core model","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21227/pvmd-s489","is_oa":true,"landing_page_url":"https://doi.org/10.21227/pvmd-s489","pdf_url":null,"source":{"id":"https://openalex.org/S7407051695","display_name":"IEEE DataPort","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"dataset"}],"best_oa_location":{"id":"doi:10.21227/pvmd-s489","is_oa":true,"landing_page_url":"https://doi.org/10.21227/pvmd-s489","pdf_url":null,"source":{"id":"https://openalex.org/S7407051695","display_name":"IEEE DataPort","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"dataset"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,175],"propose":[1,150],"MM-Vet":[2,98,128,180],"v2,":[3,99,181],"an":[4,151],"evaluation":[5,51,71,147,161],"benchmark":[6,52],"that":[7,73,105,183],"examines":[8,136],"large":[9],"multimodal":[10,15,65],"models":[11],"(LMMs)":[12],"on":[13,30,102,179],"complicated":[14,64,111],"tasks.":[16],"Recent":[17],"LMMs":[18,178],"have":[19],"shown":[20],"various":[21],"intriguing":[22,107],"abilities,":[23],"such":[24],"as":[25],"solving":[26],"math":[27],"problems":[28],"written":[29],"the":[31,63,103,106,137,160,188,220],"blackboard,":[32],"reasoning":[33],"about":[34],"events":[35],"and":[36,41,61,78,81,135,166,213,219],"celebrities":[37],"in":[38,170],"news":[39],"images,":[40],"explaining":[42],"visual":[43],"jokes.":[44],"Rapid":[45],"model":[46,86,118,190],"advancements":[47],"pose":[48],"challenges":[49],"to":[50,58,69,84,109,121],"development.":[53],"Problems":[54],"include:":[55],"(1)":[56],"How":[57,68,83],"systematically":[59],"structure":[60],"evaluate":[62,176],"tasks;":[66],"(2)":[67],"design":[70],"metrics":[72],"work":[74],"well":[75],"across":[76,162],"question":[77,164],"answer":[79,167],"types;":[80],"(3)":[82],"give":[85],"insights":[87],"beyond":[88],"a":[89,116,171,192,208],"simple":[90],"performance":[91],"ranking.":[92],"To":[93],"this":[94],"end,":[95],"we":[96,149],"present":[97],"designed":[100],"based":[101],"insight":[104],"ability":[108],"solve":[110],"tasks":[112],"often":[113],"stems":[114],"from":[115,143],"generalist":[117],"being":[119],"able":[120],"integrate":[122],"different":[123,163],"core":[124,132],"vision-language":[125],"(VL)":[126],"capabilities.":[127],"v2":[129],"defines":[130],"7":[131],"VL":[133],"capabilities":[134],"39":[138],"integrations":[139],"of":[140,194,210],"interest":[141],"derived":[142],"their":[144],"combinations.":[145],"For":[146],"metrics,":[148],"LLM-based":[152],"evaluator":[153,158,222],"for":[154],"open-ended":[155],"outputs.":[156],"The":[157],"enables":[159],"types":[165],"styles,":[168],"resulting":[169],"unified":[172],"scoring":[173],"metric.":[174],"representative":[177],"finding":[182],"Claude":[184],"3.5":[185],"Sonnet":[186],"is":[187],"best":[189],"with":[191,207],"score":[193,209],"71.8,":[195],"slightly":[196],"outperforming":[197],"GPT-4o":[198],"which":[199],"scored":[200],"71.0.":[201],"Among":[202],"open-weight":[203],"models,":[204],"InternVL2-Llama3-76B":[205],"leads":[206],"68.4.":[211],"Code":[212],"data":[214],"are":[215],"available":[216],"at":[217,223],"https://github.com/yuweihao/MM-Vet,":[218],"online":[221],"https://huggingface.co/spaces/whyu/MM-Vet-v2_Evaluator.":[224]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
