{"id":"https://openalex.org/W7151779066","doi":"https://doi.org/10.48550/arxiv.2604.05015","title":"Video-MME-v2: Towards the Next Stage in Benchmarks for Comprehensive Video Understanding","display_name":"Video-MME-v2: Towards the Next Stage in Benchmarks for Comprehensive Video Understanding","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7151779066","doi":"https://doi.org/10.48550/arxiv.2604.05015"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.05015","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.05015","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.05015","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014172220","display_name":"Chaoyou Fu","orcid":"https://orcid.org/0000-0002-0079-7668"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Chaoyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112644306","display_name":"Haozhi Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Haozhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133154005","display_name":"Yuhao Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Yuhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133212667","display_name":"Yi-Fan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yi-Fan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133164386","display_name":"Yunhang Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Yunhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133218364","display_name":"Xiaoxing Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xiaoxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133148187","display_name":"Xueying Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xueying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133198948","display_name":"Jinsen Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Jinsen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112909169","display_name":"Chengwu Long","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Chengwu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126429187","display_name":"Xiaoyao Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Xiaoyao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133170242","display_name":"Yongkang Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yongkang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054226277","display_name":"Xiawu Zheng","orcid":"https://orcid.org/0000-0002-6855-5403"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Xiawu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133191858","display_name":"Xue Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133198006","display_name":"Haoyu Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Haoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133218389","display_name":"Yunsheng Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yunsheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133181647","display_name":"Ziwei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133189391","display_name":"Xing Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Xing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133175452","display_name":"Caifeng Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Caifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133207336","display_name":"Ran He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Ran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":19,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.003700000001117587,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.003000000026077032,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6330000162124634},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.6299999952316284},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6054999828338623},{"id":"https://openalex.org/keywords/information-bottleneck-method","display_name":"Information bottleneck method","score":0.5622000098228455},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.45660001039505005},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.43560001254081726},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.38530001044273376},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.38429999351501465},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.3564000129699707}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8579000234603882},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6330000162124634},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6299999952316284},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6054999828338623},{"id":"https://openalex.org/C60008888","wikidata":"https://www.wikidata.org/wiki/Q6031013","display_name":"Information bottleneck method","level":3,"score":0.5622000098228455},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4645000100135803},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.45660001039505005},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.43560001254081726},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4002000093460083},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.38530001044273376},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.3564000129699707},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.3190000057220459},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.3188999891281128},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.31610000133514404},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.3052000105381012},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.30079999566078186},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26899999380111694},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.25769999623298645},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.2558000087738037}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.05015","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.05015","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.05015","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.05015","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"With":[0],"the":[1,40,61,164,238],"rapid":[2],"advancement":[3],"of":[4,44,63,154,163,240],"video":[5,45,64,167,242],"understanding,":[6],"existing":[7],"benchmarks":[8],"are":[9],"becoming":[10],"increasingly":[11],"saturated,":[12],"exposing":[13,228],"a":[14,33,54,91,132,172,185,233],"critical":[15],"discrepancy":[16],"between":[17,175],"inflated":[18],"leaderboard":[19],"scores":[20],"and":[21,42,76,103,114,141,149,180,183,195],"real-world":[22],"model":[23,50,178],"capabilities.":[24],"To":[25,47,124],"address":[26],"this":[27],"widening":[28],"gap,":[29],"we":[30,52,89],"introduce":[31],"Video-MME-v2,":[32],"comprehensive":[34],"benchmark":[35],"designed":[36],"to":[37,72,78,85,118,151,159,199],"rigorously":[38,133],"evaluate":[39,49],"robustness":[41],"faithfulness":[43],"understanding.":[46],"systematically":[48],"capabilities,":[51],"design":[53],"\\textbf{progressive":[55],"tri-level":[56],"hierarchy}":[57],"that":[58,96,206],"incrementally":[59],"increases":[60],"complexity":[62],"comprehension,":[65],"ranging":[66],"from":[67],"multi-point":[68],"visual":[69,192,225],"information":[70,193],"aggregation,":[71],"temporal":[73,196],"dynamics":[74],"modeling,":[75],"ultimately":[77],"complex":[79],"multimodal":[80],"reasoning.":[81,107,123,202],"Besides,":[82],"in":[83,105,191,223],"contrast":[84],"conventional":[86],"per-question":[87],"accuracy,":[88],"propose":[90],"\\textbf{group-based":[92],"non-linear":[93],"evaluation}":[94],"strategy":[95],"enforces":[97],"both":[98],"consistency":[99],"across":[100],"related":[101],"queries":[102],"coherence":[104],"multi-step":[106],"It":[108],"penalizes":[109],"fragmented":[110],"or":[111],"guess-based":[112],"correctness":[113],"assigns":[115],"credit":[116],"only":[117],"answers":[119],"supported":[120],"by":[121,146],"valid":[122],"guarantee":[125],"data":[126],"quality,":[127],"Video-MME-v2":[128,157,231],"is":[129,209],"constructed":[130],"through":[131],"controlled":[134],"human":[135,181],"annotation":[136],"pipeline,":[137],"involving":[138],"12":[139],"annotators":[140],"50":[142],"independent":[143],"reviewers.":[144],"Backed":[145],"\\textbf{3,300":[147],"human-hours}":[148],"up":[150],"\\textbf{5":[152],"rounds}":[153],"quality":[155],"assurance,":[156],"aims":[158],"serve":[160],"as":[161],"one":[162],"most":[165],"authoritative":[166],"benchmarks.":[168],"Extensive":[169],"experiments":[170],"reveal":[171],"substantial":[173],"gap":[174],"current":[176],"best":[177],"Gemini-3-Pro":[179],"experts,":[182],"uncover":[184],"clear":[186],"hierarchical":[187],"bottleneck":[188],"where":[189],"errors":[190],"aggregation":[194],"modeling":[197],"propagate":[198],"limit":[200],"high-level":[201],"We":[203],"further":[204],"find":[205],"thinking-based":[207],"reasoning":[208],"highly":[210],"dependent":[211],"on":[212],"textual":[213],"cues,":[214],"improving":[215],"performance":[216],"with":[217],"subtitles":[218],"but":[219],"sometimes":[220],"degrading":[221],"it":[222],"purely":[224],"settings.":[226],"By":[227],"these":[229],"limitations,":[230],"establishes":[232],"demanding":[234],"new":[235],"testbed":[236],"for":[237],"development":[239],"next-generation":[241],"MLLMs.":[243]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-09T00:00:00"}
