{"id":"https://openalex.org/W7147613864","doi":"https://doi.org/10.48550/arxiv.2603.27593","title":"STRIDE: When to Speak Meets Sequence Denoising for Streaming Video Understanding","display_name":"STRIDE: When to Speak Meets Sequence Denoising for Streaming Video Understanding","publication_year":2026,"publication_date":"2026-03-29","ids":{"openalex":"https://openalex.org/W7147613864","doi":"https://doi.org/10.48550/arxiv.2603.27593"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.27593","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27593","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.27593","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132692151","display_name":"Junho Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kim, Junho","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043134790","display_name":"Hosu Lee","orcid":"https://orcid.org/0000-0002-8702-5993"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Hosu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132611027","display_name":"James M. Rehg","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rehg, James M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132624899","display_name":"Minsu Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Minsu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132648814","display_name":"Yong Man Ro","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ro, Yong Man","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5132692151"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3723999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3723999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.11980000138282776,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.11140000075101852,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.48260000348091125},{"id":"https://openalex.org/keywords/sliding-window-protocol","display_name":"Sliding window protocol","score":0.438400000333786},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.3774999976158142},{"id":"https://openalex.org/keywords/video-quality","display_name":"Video quality","score":0.37619999051094055},{"id":"https://openalex.org/keywords/video-streaming","display_name":"Video streaming","score":0.3531000018119812},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.35109999775886536},{"id":"https://openalex.org/keywords/online-video","display_name":"Online video","score":0.35019999742507935},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.35010001063346863}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8429999947547913},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.48260000348091125},{"id":"https://openalex.org/C102392041","wikidata":"https://www.wikidata.org/wiki/Q592860","display_name":"Sliding window protocol","level":3,"score":0.438400000333786},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.41269999742507935},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40459999442100525},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.3774999976158142},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.37619999051094055},{"id":"https://openalex.org/C2986160907","wikidata":"https://www.wikidata.org/wiki/Q220499","display_name":"Video streaming","level":2,"score":0.3531000018119812},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.35109999775886536},{"id":"https://openalex.org/C2988167200","wikidata":"https://www.wikidata.org/wiki/Q16885149","display_name":"Online video","level":2,"score":0.35019999742507935},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.35010001063346863},{"id":"https://openalex.org/C2777611316","wikidata":"https://www.wikidata.org/wiki/Q39045282","display_name":"Streaming data","level":2,"score":0.3411000072956085},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.32839998602867126},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3025999963283539},{"id":"https://openalex.org/C65483669","wikidata":"https://www.wikidata.org/wiki/Q3536669","display_name":"Video processing","level":2,"score":0.2989000082015991},{"id":"https://openalex.org/C18007350","wikidata":"https://www.wikidata.org/wiki/Q7394815","display_name":"STRIDE","level":2,"score":0.2904999852180481},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2904999852180481},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C2777851325","wikidata":"https://www.wikidata.org/wiki/Q7094102","display_name":"Online model","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.27593","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27593","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.27593","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27593","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.6887125968933105,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"progress":[1],"in":[2,55,71,158],"video":[3,29,57,73],"large":[4],"language":[5],"models":[6,141],"(Video-LLMs)":[7],"has":[8],"enabled":[9],"strong":[10],"offline":[11],"reasoning":[12],"over":[13,89],"long":[14],"and":[15,25,33,94,125,139,148],"complex":[16],"videos.":[17],"However,":[18],"real-world":[19],"deployments":[20],"increasingly":[21],"require":[22],"streaming":[23,56,72,137,160],"perception":[24],"proactive":[26,53,151],"interaction,":[27],"where":[28],"frames":[30,100],"arrive":[31],"online":[32,159],"the":[34,66,119,131],"system":[35],"must":[36],"decide":[37],"not":[38],"only":[39],"what":[40],"to":[41,46,122],"respond,":[42],"but":[43],"also":[44],"when":[45],"respond.":[47],"In":[48],"this":[49,81],"work,":[50],"we":[51,84],"revisit":[52],"activation":[54,77,86,120,128],"as":[58,98],"a":[59,90,113],"structured":[60],"sequence":[61],"modeling":[62],"problem,":[63],"motivated":[64],"by":[65],"observation":[67],"that":[68,143],"temporal":[69,92],"transitions":[70],"naturally":[74],"form":[75],"span-structured":[76],"patterns.":[78],"To":[79],"capture":[80],"span-level":[82],"structure,":[83],"model":[85],"signals":[87,129],"jointly":[88,123],"sliding":[91],"window":[93],"update":[95],"them":[96],"iteratively":[97],"new":[99],"arrive.":[101],"We":[102],"propose":[103],"STRIDE":[104,144],"(Structured":[105],"Temporal":[106],"Refinement":[107],"with":[108],"Iterative":[109],"DEnoising),":[110],"which":[111],"employs":[112],"lightweight":[114],"masked":[115],"diffusion":[116],"module":[117],"at":[118],"interface":[121],"predict":[124],"progressively":[126],"refine":[127],"across":[130],"window.":[132],"Extensive":[133],"experiments":[134],"on":[135],"diverse":[136],"benchmarks":[138],"downstream":[140],"demonstrate":[142],"shows":[145],"more":[146],"reliable":[147],"temporally":[149],"coherent":[150],"responses,":[152],"significantly":[153],"improving":[154],"when-to-speak":[155],"decision":[156],"quality":[157],"scenarios.":[161]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
