{"id":"https://openalex.org/W7137823731","doi":"https://doi.org/10.1609/aaai.v40i9.37694","title":"Vista: Scene-Aware Optimization for Streaming Video Question Answering Under Post-Hoc Queries","display_name":"Vista: Scene-Aware Optimization for Streaming Video Question Answering Under Post-Hoc Queries","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137823731","doi":"https://doi.org/10.1609/aaai.v40i9.37694"},"language":"en","primary_location":{"id":"doi:10.1609/aaai.v40i9.37694","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i9.37694","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37694/41656","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37694/41656","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Haocheng Lu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210105511","display_name":"Beacon Tech (Israel)","ror":"https://ror.org/00zbjch89","country_code":"IL","type":"company","lineage":["https://openalex.org/I4210105511"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Haocheng Lu","raw_affiliation_strings":["Huazhong University of Science and Technology\nPing An Technology (Shenzhen) Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology\nPing An Technology (Shenzhen) Co., Ltd","institution_ids":["https://openalex.org/I4210105511"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Nan Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Nan Zhang","raw_affiliation_strings":["Ping An Technology (Shenzhen) Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ping An Technology (Shenzhen) Co., Ltd","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wei Tao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210105511","display_name":"Beacon Tech (Israel)","ror":"https://ror.org/00zbjch89","country_code":"IL","type":"company","lineage":["https://openalex.org/I4210105511"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Wei Tao","raw_affiliation_strings":["Huazhong University of Science and Technology\nPing An Technology (Shenzhen) Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology\nPing An Technology (Shenzhen) Co., Ltd","institution_ids":["https://openalex.org/I4210105511"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaoyang Qu","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaoyang Qu","raw_affiliation_strings":["Ping An Technology (Shenzhen) Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ping An Technology (Shenzhen) Co., Ltd","institution_ids":["https://openalex.org/I4401726822"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Guokuan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guokuan Li","raw_affiliation_strings":["Huazhong University of Science and Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiguang Wan","orcid":null},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiguang Wan","raw_affiliation_strings":["Huazhong University of Science and Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jianzong Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726822","display_name":"Ping An (China)","ror":"https://ror.org/004yv2z91","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726822"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Jianzong Wang","raw_affiliation_strings":["Ping An Technology (Shenzhen) Co., Ltd"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Ping An Technology (Shenzhen) Co., Ltd","institution_ids":["https://openalex.org/I4401726822"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":9.8252,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.92521368,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"40","issue":"9","first_page":"7539","last_page":"7547"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9864000082015991,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9864000082015991,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.0017999999690800905,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.001500000013038516,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6323999762535095},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5587000250816345},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.5386999845504761},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5357999801635742},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5217000246047974},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5188999772071838},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5149000287055969},{"id":"https://openalex.org/keywords/video-compression-picture-types","display_name":"Video compression picture types","score":0.5023000240325928},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4691999852657318}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8720999956130981},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6323999762535095},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5587000250816345},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.5386999845504761},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5357999801635742},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5217000246047974},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5188999772071838},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5149000287055969},{"id":"https://openalex.org/C106030495","wikidata":"https://www.wikidata.org/wiki/Q1797012","display_name":"Video compression picture types","level":4,"score":0.5023000240325928},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4691999852657318},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.4636000096797943},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.42179998755455017},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4171999990940094},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.41429999470710754},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.39239999651908875},{"id":"https://openalex.org/C23431618","wikidata":"https://www.wikidata.org/wiki/Q1404672","display_name":"Multiview Video Coding","level":4,"score":0.37229999899864197},{"id":"https://openalex.org/C108803254","wikidata":"https://www.wikidata.org/wiki/Q857512","display_name":"Smacker video","level":4,"score":0.37229999899864197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3707999885082245},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3345000147819519},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3010999858379364},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2994000017642975},{"id":"https://openalex.org/C2776566319","wikidata":"https://www.wikidata.org/wiki/Q3495514","display_name":"Interactive video","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2685999870300293},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C151211776","wikidata":"https://www.wikidata.org/wiki/Q2778015","display_name":"Video capture","level":3,"score":0.25780001282691956},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.25540000200271606}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1609/aaai.v40i9.37694","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i9.37694","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37694/41656","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:ojs.aaai.org:article/37694","is_oa":false,"landing_page_url":"https://ojs.aaai.org/index.php/AAAI/article/view/37694","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"2159-5399","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i9.37694","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i9.37694","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/37694/41656","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.4609675407409668,"display_name":"Industry, innovation and infrastructure"}],"awards":[{"id":"https://openalex.org/G8489799302","display_name":null,"funder_award_id":"2023YFB4502701","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320324174","display_name":"Natural Science Foundation of Shandong Province","ror":null},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7137823731.pdf","grobid_xml":"https://content.openalex.org/works/W7137823731.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Streaming":[0],"video":[1,17,62,72,191],"question":[2],"answering":[3],"(Streaming":[4],"Video":[5],"QA)":[6],"poses":[7],"distinct":[8],"challenges":[9],"for":[10,59,116,188],"multimodal":[11],"large":[12],"language":[13],"models":[14],"(MLLMs),":[15],"as":[16],"frames":[18,91,123],"arrive":[19],"sequentially":[20],"and":[21,67,94,111,141,151,156],"user":[22],"queries":[23],"can":[24,78],"be":[25,79],"issued":[26],"at":[27],"arbitrary":[28],"timepoints.":[29],"Existing":[30],"solutions":[31],"relying":[32],"on":[33,176],"fixed-size":[34],"memory":[35,45,115,172],"or":[36,44,171],"naive":[37],"compression":[38],"often":[39],"suffer":[40],"from":[41],"context":[42],"loss":[43],"overflow,":[46],"limiting":[47],"their":[48],"effectiveness":[49],"in":[50,81,113],"long-form,":[51],"real-time":[52],"scenarios.We":[53],"present":[54],"Vista,":[55],"a":[56,107,134,160,185],"novel":[57],"framework":[58],"scene-aware":[60],"streaming":[61,190],"QA":[63],"that":[64,179],"enables":[65],"efficient":[66,117],"scalable":[68],"reasoning":[69,167],"over":[70],"continuous":[71],"streams.":[73],"The":[74],"innovation":[75],"of":[76,162],"Vista":[77,87,153,180],"summarized":[80],"three":[82],"aspects:":[83],"(1)":[84],"Scene-aware":[85,100,130],"segmentation.":[86],"dynamically":[88],"clusters":[89],"incoming":[90],"into":[92,106,143],"temporally":[93],"visually":[95],"coherent":[96],"scene":[97,103],"units.":[98],"(2)":[99],"compression.":[101],"Each":[102],"is":[104,154],"compressed":[105],"compact":[108],"token":[109],"representation":[110],"stored":[112],"GPU":[114],"index-based":[118],"retrieval,":[119],"while":[120],"the":[121,144],"full-resolution":[122],"are":[124,138],"offloaded":[125],"to":[126],"CPU":[127],"memory.":[128],"(3)":[129],"recall.":[131],"Upon":[132],"receiving":[133],"question,":[135],"relevant":[136],"scenes":[137],"selectively":[139],"recalled":[140],"reintegrated":[142],"model\u2019s":[145],"input":[146],"space,":[147],"enabling":[148,165],"both":[149],"efficiency":[150],"completeness.":[152],"model-agnostic":[155],"integrates":[157],"seamlessly":[158],"with":[159],"variety":[161],"vision-language":[163],"backbones,":[164],"long-context":[166],"without":[168],"compromising":[169],"latency":[170],"efficiency.":[173],"Extensive":[174],"experiments":[175],"StreamingBench":[177],"demonstrate":[178],"achieves":[181],"state-of-the-art":[182],"performance,":[183],"establishing":[184],"strong":[186],"baseline":[187],"real-world":[189],"understanding.":[192]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-12T00:00:00"}
