{"id":"https://openalex.org/W7141643304","doi":"https://doi.org/10.48550/arxiv.2603.25746","title":"ShotStream: Streaming Multi-Shot Video Generation for Interactive Storytelling","display_name":"ShotStream: Streaming Multi-Shot Video Generation for Interactive Storytelling","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7141643304","doi":"https://doi.org/10.48550/arxiv.2603.25746"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.25746","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25746","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.25746","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100767840","display_name":"Yawen Luo","orcid":"https://orcid.org/0009-0007-3716-8914"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Luo, Yawen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130794432","display_name":"Xiaoyu Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Xiaoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130746417","display_name":"Junhao Zhuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Junhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074021977","display_name":"Yutian Chen","orcid":"https://orcid.org/0000-0001-8008-9014"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yutian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130782016","display_name":"Quande Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Quande","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130789042","display_name":"Xintao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xintao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130783984","display_name":"Pengfei Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Pengfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130738998","display_name":"Tianfan Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Tianfan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5100767840"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.24150000512599945,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.24150000512599945,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.1891999989748001,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.11150000244379044,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4659000039100647},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.460099995136261},{"id":"https://openalex.org/keywords/interactivity","display_name":"Interactivity","score":0.4366999864578247},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3774999976158142},{"id":"https://openalex.org/keywords/interactive-storytelling","display_name":"Interactive storytelling","score":0.37400001287460327},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.3677000105381012},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.3310999870300293},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.3301999866962433},{"id":"https://openalex.org/keywords/storytelling","display_name":"Storytelling","score":0.3052000105381012}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8551999926567078},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4659000039100647},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.460099995136261},{"id":"https://openalex.org/C144430266","wikidata":"https://www.wikidata.org/wiki/Q839721","display_name":"Interactivity","level":2,"score":0.4366999864578247},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3894999921321869},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3774999976158142},{"id":"https://openalex.org/C2779754051","wikidata":"https://www.wikidata.org/wiki/Q2903135","display_name":"Interactive storytelling","level":4,"score":0.37400001287460327},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.35350000858306885},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3310999870300293},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3296000063419342},{"id":"https://openalex.org/C2776538412","wikidata":"https://www.wikidata.org/wiki/Q989963","display_name":"Storytelling","level":3,"score":0.3052000105381012},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C52723943","wikidata":"https://www.wikidata.org/wiki/Q1127410","display_name":"Serialization","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2815000116825104},{"id":"https://openalex.org/C2781020372","wikidata":"https://www.wikidata.org/wiki/Q533093","display_name":"On the fly","level":2,"score":0.26919999718666077},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C2780787791","wikidata":"https://www.wikidata.org/wiki/Q6966232","display_name":"Narrativity","level":3,"score":0.2669000029563904},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.265500009059906},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C28034677","wikidata":"https://www.wikidata.org/wiki/Q17092530","display_name":"Interleaving","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.25746","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25746","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.25746","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25746","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4600965976715088,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-shot":[0],"video":[1],"generation":[2,43],"is":[3,74,142],"crucial":[4],"for":[5,119,134,220],"long":[6],"narrative":[7],"storytelling,":[8],"yet":[9],"current":[10,132],"bidirectional":[11,70,215],"architectures":[12],"suffer":[13],"from":[14],"limited":[15],"interactivity":[16],"and":[17,32,92,174,225],"high":[18],"latency.":[19],"We":[20,59],"propose":[21,159],"ShotStream,":[22],"a":[23,65,69,78,105,112,123,138,160,204],"novel":[24],"causal":[25,79],"multi-shot":[26,195],"architecture":[27],"that":[28,191],"enables":[29],"interactive":[30,222],"storytelling":[31],"efficient":[33],"on-the-fly":[34],"frame":[35],"generation.":[36],"By":[37],"reformulating":[38],"the":[39,87,131,147,185,211,218,231],"task":[40],"as":[41,228,230],"next-shot":[42,71],"conditioned":[44,169],"on":[45,170,203,235],"historical":[46,172],"context,":[47],"ShotStream":[48,192],"allows":[49],"users":[50],"to":[51,144,150,154,177],"dynamically":[52],"instruct":[53],"ongoing":[54],"narratives":[55],"via":[56,81],"streaming":[57],"prompts.":[58],"achieve":[60],"this":[61],"by":[62],"first":[63],"fine-tuning":[64],"text-to-video":[66],"model":[67],"into":[68,77],"generator,":[72],"which":[73],"then":[75],"distilled":[76],"student":[80],"Distribution":[82],"Matching":[83],"Distillation.":[84],"To":[85],"overcome":[86],"challenges":[88],"of":[89,213],"inter-shot":[90,120,178],"consistency":[91],"error":[93,156],"accumulation":[94],"inherent":[95],"in":[96],"autoregressive":[97],"generation,":[98],"we":[99,158],"introduce":[100],"two":[101,148],"key":[102],"innovations.":[103],"First,":[104],"dual-cache":[106],"memory":[107],"mechanism":[108],"preserves":[109],"visual":[110],"coherence:":[111],"global":[113],"context":[114,125],"cache":[115,126],"retains":[116],"conditional":[117],"frames":[118,129],"consistency,":[121],"while":[122],"local":[124],"holds":[127],"generated":[128],"within":[130],"shot":[133],"intra-shot":[135,167],"consistency.":[136],"And":[137],"RoPE":[139],"discontinuity":[140],"indicator":[141],"employed":[143],"explicitly":[145],"distinguish":[146],"caches":[149],"eliminate":[151],"ambiguity.":[152],"Second,":[153],"mitigate":[155],"accumulation,":[157],"two-stage":[161],"distillation":[162],"strategy.":[163],"This":[164],"begins":[165],"with":[166,197],"self-forcing":[168,179],"ground-truth":[171],"shots":[173],"progressively":[175],"extends":[176],"using":[180],"self-generated":[181],"histories,":[182],"effectively":[183],"bridging":[184],"train-test":[186],"gap.":[187],"Extensive":[188],"experiments":[189],"demonstrate":[190],"generates":[193],"coherent":[194],"videos":[196],"sub-second":[198],"latency,":[199],"achieving":[200],"16":[201],"FPS":[202],"single":[205],"GPU.":[206],"It":[207],"matches":[208],"or":[209],"exceeds":[210],"quality":[212],"slower":[214],"models,":[216,232],"paving":[217],"way":[219],"real-time":[221],"storytelling.":[223],"Training":[224],"inference":[226],"code,":[227],"well":[229],"are":[233],"available":[234],"our":[236]},"counts_by_year":[],"updated_date":"2026-03-28T06:16:51.555046","created_date":"2026-03-28T00:00:00"}
