{"id":"https://openalex.org/W7157064814","doi":"https://doi.org/10.48550/arxiv.2604.23407","title":"PushupBench: Your VLM is not good at counting pushups","display_name":"PushupBench: Your VLM is not good at counting pushups","publication_year":2026,"publication_date":"2026-04-25","ids":{"openalex":"https://openalex.org/W7157064814","doi":"https://doi.org/10.48550/arxiv.2604.23407"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.23407","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23407","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.23407","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134774123","display_name":"Shengzhi Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shengzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068702937","display_name":"Jiarun Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiarun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134794130","display_name":"Karun Sharma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sharma, Karun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134794984","display_name":"Jiaqi Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Jiaqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5034495880","display_name":"Shichao Pei","orcid":"https://orcid.org/0000-0002-0802-1506"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pei, Shichao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.84170001745224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.84170001745224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0658000037074089,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.04690000042319298,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6208999752998352},{"id":"https://openalex.org/keywords/proxy","display_name":"Proxy (statistics)","score":0.5149000287055969},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.42590001225471497},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.32409998774528503},{"id":"https://openalex.org/keywords/clips","display_name":"CLIPS","score":0.32199999690055847},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.31049999594688416}],"concepts":[{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6208999752998352},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5471000075340271},{"id":"https://openalex.org/C2780148112","wikidata":"https://www.wikidata.org/wiki/Q1432581","display_name":"Proxy (statistics)","level":2,"score":0.5149000287055969},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5067999958992004},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C2778739407","wikidata":"https://www.wikidata.org/wiki/Q165372","display_name":"CLIPS","level":2,"score":0.32199999690055847},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.32089999318122864},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30959999561309814},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2865000069141388},{"id":"https://openalex.org/C2776141515","wikidata":"https://www.wikidata.org/wiki/Q1274479","display_name":"Repetition (rhetorical device)","level":2,"score":0.28540000319480896},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.2770000100135803},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2711000144481659},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2639000117778778},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.23407","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23407","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.23407","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23407","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"vision-language":[1],"models":[2,39,53],"(VLMs)":[3],"can":[4],"recognize":[5],"\\textit{what}":[6],"happens":[7],"in":[8,89],"video":[9,71],"but":[10],"fail":[11],"to":[12,69],"count":[13,57],"\\textit{how":[14],"many}":[15],"times.":[16],"We":[17,45],"introduce":[18],"\\textbf{PushupBench},":[19],"446":[20],"long-form":[21],"clips":[22],"(avg.":[23],"36.7s)":[24],"for":[25,84],"evaluating":[26],"repetition":[27],"counting.":[28],"The":[29],"best":[30],"frontier":[31],"model":[32],"achieves":[33],"42.1\\%":[34],"exact":[35],"accuracy;":[36],"open-source":[37],"4B":[38],"score":[40],"$\\sim$6\\%,":[41],"matching":[42],"supervised":[43],"baselines.":[44],"show":[46],"that":[47],"accuracy":[48],"alone":[49],"misleads":[50],"--":[51],"weaker":[52],"exploit":[54],"the":[55],"modal":[56],"rather":[58],"than":[59],"reason":[60],"temporally.":[61],"Fine-tuning":[62],"on":[63,94],"counting":[64,80],"with":[65],"1k":[66],"samples":[67],"transfers":[68],"general":[70],"understanding:":[72],"MVBench":[73],"(+2.15),":[74],"PerceptionTest":[75],"(+1.88),":[76],"TVBench":[77],"(+4.54),":[78],"suggesting":[79],"is":[81],"a":[82],"proxy":[83],"broader":[85],"temporal":[86],"reasoning.PushupBench":[87],"incorporated":[88],"\\texttt{lmms-eval}":[90],"(https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/1262)":[91],"and":[92],"hosted":[93],"(pushupbench.com/)":[95]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-29T00:00:00"}
