{"id":"https://openalex.org/W4415202952","doi":"https://doi.org/10.1109/iccv51701.2025.01957","title":"Flash-Vstream: Efficient Real-Time Understanding for Long Video Streams","display_name":"Flash-Vstream: Efficient Real-Time Understanding for Long Video Streams","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4415202952","doi":"https://doi.org/10.1109/iccv51701.2025.01957"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01957","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01957","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.23825","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112582535","display_name":"Haoji Zhang","orcid":"https://orcid.org/0009-0000-1467-1719"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haoji Zhang","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100662110","display_name":"Yiqin Wang","orcid":"https://orcid.org/0000-0003-4714-774X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiqin Wang","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101926295","display_name":"Yansong Tang","orcid":"https://orcid.org/0000-0002-8768-8498"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yansong Tang","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100724297","display_name":"Yong Liu","orcid":"https://orcid.org/0000-0003-4822-8939"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yong Liu","raw_affiliation_strings":["Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100668696","display_name":"Jiashi Feng","orcid":"https://orcid.org/0000-0001-6843-0064"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiashi Feng","raw_affiliation_strings":["ByteDance Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101828825","display_name":"Xiaojie Jin","orcid":"https://orcid.org/0000-0003-2789-0923"},"institutions":[{"id":"https://openalex.org/I21193070","display_name":"Beijing Jiaotong University","ror":"https://ror.org/01yj56c84","country_code":"CN","type":"education","lineage":["https://openalex.org/I21193070"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaojie Jin","raw_affiliation_strings":["Beijing Jiaotong University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beijing Jiaotong University","institution_ids":["https://openalex.org/I21193070"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5112582535"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.25980831,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"21059","last_page":"21069"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11165","display_name":"Image and Video Quality Assessment","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10741","display_name":"Video Coding and Compression Technologies","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.6134999990463257},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5214999914169312},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4909999966621399},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44920000433921814},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.37130001187324524},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3490000069141388},{"id":"https://openalex.org/keywords/auxiliary-memory","display_name":"Auxiliary memory","score":0.33869999647140503},{"id":"https://openalex.org/keywords/video-processing","display_name":"Video processing","score":0.32679998874664307}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8655999898910522},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.6134999990463257},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5214999914169312},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4909999966621399},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44920000433921814},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39259999990463257},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3490000069141388},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.33869999647140503},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.32679998874664307},{"id":"https://openalex.org/C89198739","wikidata":"https://www.wikidata.org/wiki/Q3079880","display_name":"Data stream mining","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3156000077724457},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.31150001287460327},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2822999954223633},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C87868495","wikidata":"https://www.wikidata.org/wiki/Q750843","display_name":"Information processing","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.26489999890327454},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2635999917984009},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2630000114440918},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.2597000002861023},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2547999918460846}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01957","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01957","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2506.23825","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.23825","pdf_url":"https://arxiv.org/pdf/2506.23825","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.23825","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.23825","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.23825","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.23825","pdf_url":"https://arxiv.org/pdf/2506.23825","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Benefiting":[0],"from":[1],"the":[2,27,53,118,166],"advances":[3],"in":[4,20,40,52,96,145],"large":[5,13],"language":[6,14,83],"models":[7,15],"and":[8,22,43,65,91,116,123,154,163,169],"cross-modal":[9],"alignment,":[10],"existing":[11,47,139],"multimodal":[12],"have":[16],"achieved":[17],"prominent":[18],"performance":[19,168],"image":[21],"short":[23,57],"video":[24,82,152,156],"understanding.":[25],"However,":[26],"understanding":[28],"of":[29,86,120,172],"long":[30,50,89,151],"videos":[31,51,90],"is":[32,60,176],"still":[33],"challenging,":[34],"as":[35,56],"their":[36],"long-context":[37,113],"nature":[38],"results":[39],"significant":[41,143],"computational":[42],"memory":[44,110,127],"overhead.":[45],"Most":[46],"work":[48],"treats":[49],"same":[54],"way":[55],"videos,":[58],"which":[59],"inefficient":[61],"for":[62],"real-world":[63],"applications":[64],"hard":[66],"to":[67,69,93,111,128,138],"generalize":[68],"even":[70],"longer":[71],"videos.":[72],"To":[73],"address":[74],"these":[75],"issues,":[76],"we":[77,100],"propose":[78],"Flash-VStream,":[79],"an":[80],"efficient":[81],"model":[84,117],"capable":[85],"processing":[87],"extremely":[88],"responding":[92],"user":[94],"queries":[95],"real":[97],"time.":[98],"Particularly,":[99],"design":[101],"a":[102,107,124],"Flash":[103],"Memory":[104],"module,":[105],"containing":[106],"low-capacity":[108],"context":[109],"aggregate":[112],"temporal":[114],"information":[115,121,132],"distribution":[119],"density,":[122],"high-capacity":[125],"augmentation":[126],"retrieve":[129],"detailed":[130],"spatial":[131],"based":[133],"on":[134,150],"this":[135],"distribution.":[136],"Compared":[137],"models,":[140],"Flash-VStream":[141],"achieves":[142],"reductions":[144],"inference":[146],"latency.":[147],"Extensive":[148],"experiments":[149],"benchmarks":[153],"comprehensive":[155],"benchmarks,":[157],"i.e.,":[158],"EgoSchema,":[159],"MLVU,":[160],"LVBench,":[161],"MVBench":[162],"Video-MME,":[164],"demonstrate":[165],"state-of-the-art":[167],"outstanding":[170],"efficiency":[171],"our":[173],"method.":[174],"Code":[175],"available":[177],"at":[178],"https://github.com/IVGSZ/Flash-VStream.":[179]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-16T00:00:00"}
