{"id":"https://openalex.org/W7154332149","doi":"https://doi.org/10.48550/arxiv.2604.11102","title":"OmniScript: Towards Audio-Visual Script Generation for Long-Form Cinematic Video","display_name":"OmniScript: Towards Audio-Visual Script Generation for Long-Form Cinematic Video","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154332149","doi":"https://doi.org/10.48550/arxiv.2604.11102"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11102","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11102","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11102","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133562900","display_name":"Junfu Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Pu, Junfu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133610777","display_name":"Yuxin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yuxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133565489","display_name":"Teng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Teng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133586255","display_name":"Ying Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Ying","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5133562900"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8648999929428101,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8648999929428101,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.10719999670982361,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0027000000700354576,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scripting-language","display_name":"Scripting language","score":0.8406999707221985},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6097000241279602},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.6014999747276306},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5598000288009644},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4846999943256378},{"id":"https://openalex.org/keywords/narrative","display_name":"Narrative","score":0.4690000116825104},{"id":"https://openalex.org/keywords/plot","display_name":"Plot (graphics)","score":0.45170000195503235}],"concepts":[{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.8406999707221985},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7972999811172485},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6097000241279602},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.6014999747276306},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5831000208854675},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5598000288009644},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4846999943256378},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.4690000116825104},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4657000005245209},{"id":"https://openalex.org/C167651023","wikidata":"https://www.wikidata.org/wiki/Q1474611","display_name":"Plot (graphics)","level":2,"score":0.45170000195503235},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.42590001225471497},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.31839999556541443},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.3176000118255615},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.31189998984336853},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.29100000858306885},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2840000092983246},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11102","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11102","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11102","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11102","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"multimodal":[1],"large":[2],"language":[3,74],"models":[4,119],"(MLLMs)":[5],"have":[6],"demonstrated":[7],"remarkable":[8],"capabilities":[9],"in":[10,131],"short-form":[11],"video":[12],"understanding,":[13],"yet":[14],"translating":[15],"long-form":[16,78],"cinematic":[17],"videos":[18],"into":[19],"detailed,":[20],"temporally":[21,103],"grounded":[22],"scripts":[23,41],"remains":[24],"a":[25,55,61,85],"significant":[26],"challenge.":[27],"This":[28],"paper":[29],"introduces":[30],"the":[31],"novel":[32],"video-to-script":[33],"(V2S)":[34],"task,":[35],"aiming":[36],"to":[37,124],"generate":[38],"hierarchical,":[39],"scene-by-scene":[40],"encompassing":[42],"character":[43,96],"actions,":[44],"dialogues,":[45],"expressions,":[46],"and":[47,59,95,120,135],"audio":[48],"cues.":[49],"To":[50],"facilitate":[51],"this,":[52],"we":[53,67],"construct":[54],"first-of-its-kind":[56],"human-annotated":[57],"benchmark":[58],"propose":[60],"temporally-aware":[62],"hierarchical":[63],"evaluation":[64],"framework.":[65],"Furthermore,":[66],"present":[68],"OmniScript,":[69],"an":[70],"8B-parameter":[71],"omni-modal":[72],"(audio-visual)":[73],"model":[75],"tailored":[76],"for":[77,93],"narrative":[79],"comprehension.":[80],"OmniScript":[81,114],"is":[82],"trained":[83],"via":[84],"progressive":[86],"pipeline":[87],"that":[88,109],"leverages":[89],"chain-of-thought":[90],"supervised":[91],"fine-tuning":[92],"plot":[94],"reasoning,":[97],"followed":[98],"by":[99],"reinforcement":[100],"learning":[101],"using":[102],"segmented":[104],"rewards.":[105],"Extensive":[106],"experiments":[107],"demonstrate":[108],"despite":[110],"its":[111],"parameter":[112],"efficiency,":[113],"significantly":[115],"outperforms":[116],"larger":[117],"open-source":[118],"achieves":[121],"performance":[122],"comparable":[123],"state-of-the-art":[125],"proprietary":[126],"models,":[127],"including":[128],"Gemini":[129],"3-Pro,":[130],"both":[132],"temporal":[133],"localization":[134],"multi-field":[136],"semantic":[137],"accuracy.":[138]},"counts_by_year":[],"updated_date":"2026-04-15T06:04:33.058270","created_date":"2026-04-15T00:00:00"}
