{"id":"https://openalex.org/W7128511036","doi":"https://doi.org/10.48550/arxiv.2602.08448","title":"Vista: Scene-Aware Optimization for Streaming Video Question Answering under Post-Hoc Queries","display_name":"Vista: Scene-Aware Optimization for Streaming Video Question Answering under Post-Hoc Queries","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7128511036","doi":"https://doi.org/10.48550/arxiv.2602.08448"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.08448","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121261102","display_name":"Haocheng Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Haocheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125501721","display_name":"Nan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Nan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125558427","display_name":"Wei Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100940847","display_name":"Xiaoyang Qu","orcid":"https://orcid.org/0009-0009-6311-4332"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Xiaoyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009868061","display_name":"Guokuan Li","orcid":"https://orcid.org/0009-0005-7998-5520"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guokuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044866795","display_name":"Jiguang Wan","orcid":"https://orcid.org/0000-0003-3440-4460"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Jiguang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125573608","display_name":"Jianzong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jianzong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9821000099182129,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9821000099182129,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.004699999932199717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.0013000000035390258,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6428999900817871},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5608000159263611},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.5375000238418579},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5250999927520752},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5238999724388123},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5210999846458435},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5187000036239624},{"id":"https://openalex.org/keywords/video-compression-picture-types","display_name":"Video compression picture types","score":0.49160000681877136},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.46129998564720154}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8726999759674072},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6428999900817871},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5608000159263611},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.5375000238418579},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5250999927520752},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5238999724388123},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5210999846458435},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5187000036239624},{"id":"https://openalex.org/C106030495","wikidata":"https://www.wikidata.org/wiki/Q1797012","display_name":"Video compression picture types","level":4,"score":0.49160000681877136},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.46129998564720154},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.45980000495910645},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.4221999943256378},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4180000126361847},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.41260001063346863},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.3806000053882599},{"id":"https://openalex.org/C23431618","wikidata":"https://www.wikidata.org/wiki/Q1404672","display_name":"Multiview Video Coding","level":4,"score":0.3734000027179718},{"id":"https://openalex.org/C108803254","wikidata":"https://www.wikidata.org/wiki/Q857512","display_name":"Smacker video","level":4,"score":0.3671000003814697},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36000001430511475},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.34049999713897705},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.337799996137619},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.29580000042915344},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2953999936580658},{"id":"https://openalex.org/C2776566319","wikidata":"https://www.wikidata.org/wiki/Q3495514","display_name":"Interactive video","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2849000096321106},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2784000039100647},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C151211776","wikidata":"https://www.wikidata.org/wiki/Q2778015","display_name":"Video capture","level":3,"score":0.25429999828338623}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.08448","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.08448","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.08448","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.08448","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.4624137580394745,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Streaming":[0],"video":[1,17,64,74,195],"question":[2],"answering":[3],"(Streaming":[4],"Video":[5],"QA)":[6],"poses":[7],"distinct":[8],"challenges":[9],"for":[10,61,120,192],"multimodal":[11],"large":[12],"language":[13],"models":[14],"(MLLMs),":[15],"as":[16],"frames":[18,94,126],"arrive":[19],"sequentially":[20],"and":[21,69,97,115,132,142,155,160],"user":[22],"queries":[23],"can":[24,80],"be":[25,81],"issued":[26],"at":[27],"arbitrary":[28],"time":[29],"points.":[30],"Existing":[31],"solutions":[32],"relying":[33],"on":[34,180],"fixed-size":[35],"memory":[36,46,119,176],"or":[37,45,175],"naive":[38],"compression":[39],"often":[40],"suffer":[41],"from":[42],"context":[43],"loss":[44],"overflow,":[47],"limiting":[48],"their":[49],"effectiveness":[50],"in":[51,83,117],"long-form,":[52],"real-time":[53],"scenarios.":[54],"We":[55],"present":[56],"Vista,":[57],"a":[58,111,150,164,189],"novel":[59],"framework":[60],"scene-aware":[62,87,103,134],"streaming":[63,194],"QA":[65],"that":[66,183],"enables":[67],"efficient":[68,121],"scalable":[70],"reasoning":[71,171],"over":[72],"continuous":[73],"streams.":[75],"The":[76],"innovation":[77],"of":[78,166],"Vista":[79,90,157,184],"summarized":[82],"three":[84],"aspects:":[85],"(1)":[86],"segmentation,":[88],"where":[89,105,136],"dynamically":[91],"clusters":[92],"incoming":[93],"into":[95,110,144],"temporally":[96],"visually":[98],"coherent":[99],"scene":[100,107],"units;":[101],"(2)":[102],"compression,":[104],"each":[106],"is":[108,158],"compressed":[109],"compact":[112],"token":[113],"representation":[114],"stored":[116],"GPU":[118],"index-based":[122],"retrieval,":[123],"while":[124],"full-resolution":[125],"are":[127,139],"offloaded":[128],"to":[129],"CPU":[130],"memory;":[131],"(3)":[133],"recall,":[135],"relevant":[137],"scenes":[138],"selectively":[140],"recalled":[141],"reintegrated":[143],"the":[145],"model":[146],"input":[147],"upon":[148],"receiving":[149],"query,":[151],"enabling":[152,169],"both":[153],"efficiency":[154],"completeness.":[156],"model-agnostic":[159],"integrates":[161],"seamlessly":[162],"with":[163],"variety":[165],"vision-language":[167],"backbones,":[168],"long-context":[170],"without":[172],"compromising":[173],"latency":[174],"efficiency.":[177],"Extensive":[178],"experiments":[179],"StreamingBench":[181],"demonstrate":[182],"achieves":[185],"state-of-the-art":[186],"performance,":[187],"establishing":[188],"strong":[190],"baseline":[191],"real-world":[193],"understanding.":[196]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-11T00:00:00"}
