{"id":"https://openalex.org/W7123609658","doi":"https://doi.org/10.48550/arxiv.2601.07366","title":"HiVid-Narrator: Hierarchical Video Narrative Generation with Scene-Primed ASR-anchored Compression","display_name":"HiVid-Narrator: Hierarchical Video Narrative Generation with Scene-Primed ASR-anchored Compression","publication_year":2026,"publication_date":"2026-01-12","ids":{"openalex":"https://openalex.org/W7123609658","doi":"https://doi.org/10.48550/arxiv.2601.07366"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.07366","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.07366","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.07366","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122982128","display_name":"Haoxuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Haoxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122980032","display_name":"Mengyan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Mengyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5102516938","display_name":"Junjun Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Junjun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5122982128"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9509000182151794,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9509000182151794,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.014800000004470348,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8233000040054321},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.6195999979972839},{"id":"https://openalex.org/keywords/narrative","display_name":"Narrative","score":0.5285000205039978},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4381999969482422},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.428600013256073},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.41179999709129333}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.826200008392334},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8233000040054321},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.6195999979972839},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5339000225067139},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.5285000205039978},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4584999978542328},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4381999969482422},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.428600013256073},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.41179999709129333},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.39820000529289246},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3230000138282776},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2962999939918518},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.26759999990463257},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.25920000672340393}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.07366","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.07366","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.07366","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.07366","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Generating":[0],"structured":[1],"narrations":[2],"for":[3],"real-world":[4],"e-commerce":[5,105],"videos":[6,106],"requires":[7],"models":[8],"to":[9,25,164],"perceive":[10],"fine-grained":[11],"visual":[12,74,112],"details":[13],"and":[14,48,73,79,90,109,140],"organize":[15],"them":[16,53],"into":[17,54,86,137],"coherent,":[18],"high-level":[19],"stories--capabilities":[20],"that":[21,44,51,68,104],"existing":[22,165],"approaches":[23],"struggle":[24],"unify.":[26],"We":[27,101],"introduce":[28],"the":[29,94,115,128],"E-commerce":[30],"Hierarchical":[31],"Video":[32],"Captioning":[33],"(E-HVC)":[34],"dataset":[35],"with":[36,111,159],"dual-granularity,":[37],"temporally":[38],"grounded":[39],"annotations:":[40],"a":[41,65],"Temporal":[42,95],"Chain-of-Thought":[43],"anchors":[45],"event-level":[46],"observations":[47],"Chapter":[49],"Summary":[50],"compose":[52],"concise,":[55],"story-centric":[56],"summaries.":[57],"Rather":[58],"than":[59],"directly":[60],"prompting":[61],"chapters,":[62],"we":[63,126],"adopt":[64],"staged":[66],"construction":[67],"first":[69],"gathers":[70],"reliable":[71],"linguistic":[72],"evidence":[75],"via":[76],"curated":[77],"ASR":[78,145],"frame-level":[80],"descriptions,":[81],"then":[82],"refines":[83],"coarse":[84],"annotations":[85],"precise":[87],"chapter":[88],"boundaries":[89],"titles":[91],"conditioned":[92],"on":[93],"Chain-of-Thought,":[96],"yielding":[97],"fact-grounded,":[98],"time-aligned":[99],"narratives.":[100],"also":[102],"observe":[103],"are":[107],"fast-paced":[108],"information-dense,":[110],"tokens":[113,136,162],"dominating":[114],"input":[116,124,161],"sequence.":[117],"To":[118],"enable":[119],"efficient":[120],"training":[121],"while":[122],"reducing":[123],"tokens,":[125],"propose":[127],"Scene-Primed":[129],"ASR-anchored":[130],"Compressor":[131],"(SPA-Compressor),":[132],"which":[133],"compresses":[134],"multimodal":[135],"hierarchical":[138],"scene":[139],"event":[141],"representations":[142],"guided":[143],"by":[144],"semantic":[146],"cues.":[147],"Built":[148],"upon":[149],"these":[150],"designs,":[151],"our":[152],"HiVid-Narrator":[153],"framework":[154],"achieves":[155],"superior":[156],"narrative":[157],"quality":[158],"fewer":[160],"compared":[163],"methods.":[166]},"counts_by_year":[],"updated_date":"2026-01-14T23:44:37.837170","created_date":"2026-01-14T00:00:00"}
