{"id":"https://openalex.org/W7143372022","doi":"https://doi.org/10.48550/arxiv.2603.26653","title":"PerceptionComp: A Video Benchmark for Complex Perception-Centric Reasoning","display_name":"PerceptionComp: A Video Benchmark for Complex Perception-Centric Reasoning","publication_year":2026,"publication_date":"2026-03-27","ids":{"openalex":"https://openalex.org/W7143372022","doi":"https://doi.org/10.48550/arxiv.2603.26653"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.26653","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26653","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.26653","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130976927","display_name":"Shaoxuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Shaoxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130917348","display_name":"Zhixuan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zhixuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130926133","display_name":"Hanze Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Hanze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130930686","display_name":"Zirun Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zirun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130214784","display_name":"Shulin Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Shulin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130218391","display_name":"Zuyan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zuyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130978171","display_name":"Yushi Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yushi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039135087","display_name":"Haoning Wu","orcid":"https://orcid.org/0009-0001-8717-338X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Haoning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130920327","display_name":"Yuhao Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Yuhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130929677","display_name":"Benlin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Benlin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130955465","display_name":"Ziwei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130915188","display_name":"Ranjay Krishna","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krishna, Ranjay","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5130976927"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7731000185012817,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7731000185012817,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.0989999994635582,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.021400000900030136,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8292999863624573},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6800000071525574},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.44200000166893005},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3321000039577484},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.32910001277923584},{"id":"https://openalex.org/keywords/moment","display_name":"Moment (physics)","score":0.3222000002861023}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8292999863624573},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7400000095367432},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6800000071525574},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5759000182151794},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.44200000166893005},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4375},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3321000039577484},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3294999897480011},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.32910001277923584},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.31290000677108765},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.26653","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26653","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.26653","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26653","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7528477907180786,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"introduce":[1],"PerceptionComp,":[2],"a":[3,171],"manually":[4],"annotated":[5],"benchmark":[6,68],"for":[7],"complex,":[8],"long-horizon,":[9],"perception-centric":[10,166],"video":[11,87,168],"reasoning.":[12,66,184],"PerceptionComp":[13,101,136,177],"is":[14,21,127],"designed":[15],"so":[16],"that":[17,100,165],"no":[18],"single":[19],"moment":[20],"sufficient:":[22],"answering":[23],"each":[24],"question":[25],"requires":[26,102],"multiple":[27],"temporally":[28],"separated":[29],"pieces":[30],"of":[31],"visual":[32,60],"evidence":[33],"and":[34,39,52,54,64,89,106,118,174],"compositional":[35],"constraints":[36],"under":[37],"conjunctive":[38],"sequential":[40],"logic,":[41],"spanning":[42],"perceptual":[43,183],"subtasks":[44],"such":[45],"as":[46],"objects,":[47],"attributes,":[48],"relations,":[49],"locations,":[50],"actions,":[51],"events,":[53],"requiring":[55],"skills":[56],"including":[57,80],"semantic":[58],"recognition,":[59],"correspondence,":[61],"temporal":[62],"reasoning,":[63],"spatial":[65],"The":[67],"contains":[69],"1,114":[70],"highly":[71],"complex":[72],"questions":[73],"on":[74,115,135,138],"279":[75],"videos":[76],"from":[77],"diverse":[78],"domains":[79],"city":[81],"walk":[82],"tours,":[83,86],"indoor":[84],"villa":[85],"games,":[88],"extreme":[90],"outdoor":[91],"sports,":[92],"with":[93],"100%":[94],"manual":[95],"annotation.":[96],"Human":[97],"studies":[98],"show":[99],"substantial":[103],"test-time":[104],"thinking":[105],"repeated":[107],"perception":[108],"steps:":[109],"participants":[110],"take":[111],"much":[112],"longer":[113],"than":[114,137],"prior":[116],"benchmarks,":[117],"accuracy":[119,151],"drops":[120],"to":[121],"near":[122],"chance":[123],"(18.97%)":[124],"when":[125],"rewatching":[126],"disallowed.":[128],"State-of-the-art":[129],"MLLMs":[130],"also":[131],"perform":[132],"substantially":[133],"worse":[134],"existing":[139],"benchmarks:":[140],"the":[141,153],"best":[142],"model":[143],"in":[144,152,182],"our":[145],"evaluation,":[146],"Gemini-3-Flash,":[147],"reaches":[148],"only":[149],"45.96%":[150],"five-choice":[154],"setting,":[155],"while":[156],"open-source":[157],"models":[158],"remain":[159],"below":[160],"40%.":[161],"These":[162],"results":[163],"suggest":[164],"long-horizon":[167],"reasoning":[169],"remains":[170],"major":[172],"bottleneck,":[173],"we":[175],"hope":[176],"will":[178],"help":[179],"drive":[180],"progress":[181]},"counts_by_year":[],"updated_date":"2026-03-31T06:07:48.031334","created_date":"2026-03-31T00:00:00"}
