{"id":"https://openalex.org/W7161676630","doi":"https://doi.org/10.48550/arxiv.2605.18577","title":"OmniPro: A Comprehensive Benchmark for Omni-Proactive Streaming Video Understanding","display_name":"OmniPro: A Comprehensive Benchmark for Omni-Proactive Streaming Video Understanding","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161676630","doi":"https://doi.org/10.48550/arxiv.2605.18577"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18577","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18577","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18577","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136473534","display_name":"Ruixiang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Ruixiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136501569","display_name":"Jie Yang","orcid":"https://orcid.org/0000-0002-2948-2088"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113398435","display_name":"Zijie Xin","orcid":"https://orcid.org/0000-0002-9220-8735"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin, Zijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136497342","display_name":"Tianyi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Tianyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076304703","display_name":"Fengyun Rao","orcid":"https://orcid.org/0000-0002-2868-2088"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rao, Fengyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136490325","display_name":"Jing Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"LYU, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5060270456","display_name":"Xirong Li","orcid":"https://orcid.org/0000-0002-0220-8310"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xirong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7328000068664551,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7328000068664551,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.07729999721050262,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.02710000053048134,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7401999831199646},{"id":"https://openalex.org/keywords/polling","display_name":"Polling","score":0.7146999835968018},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6779000163078308},{"id":"https://openalex.org/keywords/mode","display_name":"Mode (computer interface)","score":0.448199987411499},{"id":"https://openalex.org/keywords/cover","display_name":"Cover (algebra)","score":0.4163999855518341},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.40209999680519104},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.3756999969482422},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.3628000020980835}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8359000086784363},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7401999831199646},{"id":"https://openalex.org/C204854418","wikidata":"https://www.wikidata.org/wiki/Q1362921","display_name":"Polling","level":2,"score":0.7146999835968018},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6779000163078308},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.448199987411499},{"id":"https://openalex.org/C2780428219","wikidata":"https://www.wikidata.org/wiki/Q16952335","display_name":"Cover (algebra)","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.40209999680519104},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3871999979019165},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3785000145435333},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3756999969482422},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3628000020980835},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.3391999900341034},{"id":"https://openalex.org/C2986160907","wikidata":"https://www.wikidata.org/wiki/Q220499","display_name":"Video streaming","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.295199990272522},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.28790000081062317},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.27889999747276306},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.27810001373291016},{"id":"https://openalex.org/C2988167200","wikidata":"https://www.wikidata.org/wiki/Q16885149","display_name":"Online video","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2702000141143799},{"id":"https://openalex.org/C182365436","wikidata":"https://www.wikidata.org/wiki/Q50701","display_name":"Variable (mathematics)","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.26409998536109924},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2574999928474426},{"id":"https://openalex.org/C22561748","wikidata":"https://www.wikidata.org/wiki/Q854954","display_name":"Videoconferencing","level":2,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18577","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18577","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18577","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18577","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Omni-proactive":[0],"streaming":[1,66,166],"video":[2,83,101],"understanding,":[3],"i.e.,":[4],"autonomously":[5,160],"deciding":[6],"when":[7,162],"to":[8,12,74,122,159,163],"speak":[9],"and":[10,51,62,81,94,114,144,198],"what":[11],"say":[13],"from":[14],"continuous":[15],"audio-visual":[16],"streams,":[17],"is":[18,117],"an":[19],"emerging":[20],"capability":[21],"of":[22,47,57,64,106],"omni-modal":[23,77],"large":[24],"language":[25],"models.":[26,67],"Existing":[27],"benchmarks":[28],"fall":[29],"short":[30],"in":[31,165],"three":[32,173],"key":[33,174],"aspects:":[34],"they":[35],"rely":[36],"primarily":[37],"on":[38],"visual":[39],"signals,":[40],"adopt":[41],"polling":[42],"or":[43,112],"fixed-timestamp":[44],"protocols":[45],"instead":[46],"true":[48],"proactive":[49,79,154],"evaluation,":[50],"cover":[52],"only":[53],"a":[54,130],"limited":[55,195],"range":[56],"tasks,":[58],"preventing":[59],"reliable":[60],"assessment":[61],"differentiation":[63],"omni-proactive":[65],"We":[68,127],"present":[69],"OmniPro,":[70],"the":[71,141,204],"first":[72],"benchmark":[73],"jointly":[75],"evaluate":[76],"perception,":[78],"responding,":[80],"diverse":[82],"understanding":[84,102,138],"tasks.":[85],"It":[86],"comprises":[87],"2,700":[88],"human-verified":[89],"samples":[90,107],"spanning":[91],"9":[92],"sub-tasks":[93],"3":[95],"cognitive":[96],"levels,":[97],"covering":[98],"6":[99],"basic":[100],"capabilities.":[103],"Notably,":[104],"84%":[105],"require":[108],"audio":[109,177,201],"signals":[110],"(speech":[111],"non-speech),":[113],"each":[115,146],"sample":[116],"annotated":[118],"with":[119,182],"modality-isolation":[120],"labels":[121],"enable":[123],"fine-grained":[124],"multimodal":[125],"analysis.":[126],"further":[128],"introduce":[129],"dual-mode":[131],"evaluation":[132],"protocol:":[133],"Probe":[134],"mode":[135,151],"assesses":[136],"content":[137],"by":[139,156],"querying":[140],"model":[142],"before":[143],"after":[145],"ground-truth":[147],"trigger,":[148],"while":[149],"Online":[150],"evaluates":[152],"full":[153],"ability":[155],"requiring":[157],"models":[158,171],"decide":[161],"respond":[164],"input.":[167],"Evaluating":[168],"11":[169],"representative":[170],"reveals":[172],"findings:":[175],"(1)":[176],"provides":[178],"consistent":[179],"gains":[180],"but":[181],"highly":[183],"variable":[184],"utilization":[185],"across":[186],"models,":[187],"(2)":[188],"performance":[189],"degrades":[190],"significantly":[191],"over":[192],"time,":[193],"indicating":[194],"long-horizon":[196],"robustness,":[197],"(3)":[199],"non-speech":[200],"perception":[202],"remains":[203],"weakest":[205],"dimension.":[206]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
