{"id":"https://openalex.org/W4404343226","doi":"https://doi.org/10.48550/arxiv.2410.23266","title":"TOMATO: Assessing Visual Temporal Reasoning Capabilities in Multimodal Foundation Models","display_name":"TOMATO: Assessing Visual Temporal Reasoning Capabilities in Multimodal Foundation Models","publication_year":2024,"publication_date":"2024-10-30","ids":{"openalex":"https://openalex.org/W4404343226","doi":"https://doi.org/10.48550/arxiv.2410.23266"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.23266","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.23266","pdf_url":"https://arxiv.org/pdf/2410.23266","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.23266","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114634116","display_name":"Ziyao Shangguan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shangguan, Ziyao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108907561","display_name":"Chuhan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chuhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067367860","display_name":"Yuxuan Ding","orcid":"https://orcid.org/0000-0003-0172-1857"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Yuxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016643106","display_name":"Yanan Zheng","orcid":"https://orcid.org/0000-0002-9823-0191"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Yanan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047416722","display_name":"Yilun Zhao","orcid":"https://orcid.org/0000-0002-6812-5120"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yilun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044374534","display_name":"Tesca Fitzgerald","orcid":"https://orcid.org/0000-0003-0867-0546"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fitzgerald, Tesca","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5064858748","display_name":"Arman Cohan","orcid":"https://orcid.org/0000-0002-8954-2724"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cohan, Arman","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5114634116"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9751999974250793,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9751999974250793,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.973800003528595,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10757","display_name":"Geographic Information Systems Studies","score":0.9627000093460083,"subfield":{"id":"https://openalex.org/subfields/3305","display_name":"Geography, Planning and Development"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.8454426527023315},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4423374831676483},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3844011127948761},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.34395909309387207},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.27106550335884094},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.1691908836364746}],"concepts":[{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.8454426527023315},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4423374831676483},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3844011127948761},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.34395909309387207},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.27106550335884094},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.1691908836364746},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.23266","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.23266","pdf_url":"https://arxiv.org/pdf/2410.23266","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.23266","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.23266","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.23266","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.23266","pdf_url":"https://arxiv.org/pdf/2410.23266","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2381393187","https://openalex.org/W2332779545","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W2358060160","https://openalex.org/W2035483685"],"abstract_inverted_index":{"Existing":[0],"benchmarks":[1,36],"often":[2],"highlight":[3],"the":[4,25,163,210,218,231],"remarkable":[5],"performance":[6,158],"achieved":[7],"by":[8,52],"state-of-the-art":[9],"Multimodal":[10,95],"Foundation":[11],"Models":[12],"(MFMs)":[13],"in":[14,108,177,186],"leveraging":[15],"temporal":[16,30,65,105],"context":[17],"for":[18,208],"video":[19,109,232],"understanding.":[20,110],"However,":[21],"how":[22],"well":[23],"do":[24],"models":[26],"truly":[27],"perform":[28],"visual":[29,64,133],"reasoning?":[31],"Our":[32,152],"study":[33],"of":[34,41,160,225],"existing":[35],"shows":[37],"that":[38,145],"this":[39,175],"capability":[40],"MFMs":[42,212],"is":[43],"likely":[44],"overestimated":[45],"as":[46,195,204,214],"many":[47],"questions":[48,117],"can":[49,182],"be":[50],"solved":[51],"using":[53],"a":[54,97,156,196,205,215],"single,":[55],"few,":[56],"or":[57],"out-of-order":[58],"frames.":[59],"To":[60],"systematically":[61],"examine":[62],"current":[63,178],"reasoning":[66,106],"tasks,":[67],"we":[68,90],"propose":[69],"three":[70],"principles":[71],"with":[72,162],"corresponding":[73],"metrics:":[74],"(1)":[75],"Multi-Frame":[76],"Gain,":[77],"(2)":[78],"Frame":[79,84],"Order":[80],"Sensitivity,":[81],"and":[82,132,142,149,213],"(3)":[83],"Information":[85],"Disparity.":[86],"Following":[87],"these":[88,193],"principles,":[89],"introduce":[91],"TOMATO,":[92],"Temporal":[93],"Reasoning":[94],"Evaluation,":[96],"novel":[98],"benchmark":[99],"crafted":[100],"to":[101,136,191,217,220],"rigorously":[102],"assess":[103],"MFMs'":[104],"capabilities":[107],"TOMATO":[111,201],"comprises":[112],"1,484":[113],"carefully":[114],"curated,":[115],"human-annotated":[116],"spanning":[118],"six":[119],"tasks":[120],"(i.e.,":[121],"action":[122],"count,":[123],"direction,":[124],"rotation,":[125],"shape":[126],"&amp;":[127,130],"trend,":[128],"velocity":[129],"frequency,":[131],"cues),":[134],"applied":[135],"1,417":[137],"videos,":[138,144],"including":[139],"805":[140],"self-recorded":[141],"-generated":[143],"encompass":[146],"human-centric,":[147],"real-world,":[148],"simulated":[150],"scenarios.":[151],"comprehensive":[153],"evaluation":[154],"reveals":[155],"human-model":[157],"gap":[159,176],"57.3%":[161],"best-performing":[164],"model.":[165],"Moreover,":[166],"our":[167],"in-depth":[168],"analysis":[169],"uncovers":[170],"more":[171],"fundamental":[172],"limitations":[173],"beyond":[174],"MFMs.":[179],"While":[180],"they":[181,189],"accurately":[183],"recognize":[184],"events":[185],"isolated":[187],"frames,":[188],"fail":[190],"interpret":[192],"frames":[194],"continuous":[197],"sequence.":[198],"We":[199],"believe":[200],"will":[202],"serve":[203],"crucial":[206],"testbed":[207],"evaluating":[209],"next-generation":[211],"call":[216],"community":[219],"develop":[221],"AI":[222],"systems":[223],"capable":[224],"comprehending":[226],"human":[227],"world":[228],"dynamics":[229],"through":[230],"modality.":[233]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
