{"id":"https://openalex.org/W4416798931","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249316","title":"Narrativity-Aware Video Summarization Based on Vision and Language Foundation Models","display_name":"Narrativity-Aware Video Summarization Based on Vision and Language Foundation Models","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416798931","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249316"},"language":null,"primary_location":{"id":"doi:10.1109/apsipaasc65261.2025.11249316","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249316","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Shumpei Saito","orcid":null},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Shumpei Saito","raw_affiliation_strings":["Graduate School of Engineering, Kyoto University,Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Engineering, Kyoto University,Japan","institution_ids":["https://openalex.org/I22299242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077739319","display_name":"Hiroyuki Ueda","orcid":"https://orcid.org/0000-0002-3938-2208"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroyuki Ueda","raw_affiliation_strings":["Graduate School of Engineering, Kyoto University,Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Engineering, Kyoto University,Japan","institution_ids":["https://openalex.org/I22299242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028906527","display_name":"Yosuke Ito","orcid":"https://orcid.org/0009-0009-4257-7530"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yosuke Ito","raw_affiliation_strings":["Graduate School of Engineering, Kyoto University,Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Engineering, Kyoto University,Japan","institution_ids":["https://openalex.org/I22299242"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067956319","display_name":"Kazuyoshi Yoshii","orcid":"https://orcid.org/0000-0001-8387-8609"},"institutions":[{"id":"https://openalex.org/I22299242","display_name":"Kyoto University","ror":"https://ror.org/02kpeqv85","country_code":"JP","type":"education","lineage":["https://openalex.org/I22299242"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kazuyoshi Yoshii","raw_affiliation_strings":["Graduate School of Engineering, Kyoto University,Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Engineering, Kyoto University,Japan","institution_ids":["https://openalex.org/I22299242"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I22299242"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.36633383,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1991","last_page":"1996"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9621000289916992,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9621000289916992,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.011099999770522118,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.006200000178068876,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.9390000104904175},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.5777999758720398},{"id":"https://openalex.org/keywords/storytelling","display_name":"Storytelling","score":0.5623999834060669},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.5376999974250793},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.45719999074935913},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4431000053882599},{"id":"https://openalex.org/keywords/salience","display_name":"Salience (neuroscience)","score":0.43130001425743103},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.41999998688697815},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4171000123023987}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.9390000104904175},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8151000142097473},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.5777999758720398},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5672000050544739},{"id":"https://openalex.org/C2776538412","wikidata":"https://www.wikidata.org/wiki/Q989963","display_name":"Storytelling","level":3,"score":0.5623999834060669},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.5376999974250793},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4652000069618225},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.45719999074935913},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4431000053882599},{"id":"https://openalex.org/C108154423","wikidata":"https://www.wikidata.org/wiki/Q1469792","display_name":"Salience (neuroscience)","level":2,"score":0.43130001425743103},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.41999998688697815},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4171000123023987},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.3894999921321869},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.3257000148296356},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31619998812675476},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.3028999865055084},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.29589998722076416},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.2953999936580658},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2924000024795532},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.2849000096321106},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27379998564720154},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.27140000462532043}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/apsipaasc65261.2025.11249316","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249316","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W1924343884","https://openalex.org/W2097117768","https://openalex.org/W2120588093","https://openalex.org/W2529272619","https://openalex.org/W2737677090","https://openalex.org/W2883429621","https://openalex.org/W2902616437","https://openalex.org/W2963524571","https://openalex.org/W2963919999","https://openalex.org/W2964167369","https://openalex.org/W3210314917","https://openalex.org/W4253028564","https://openalex.org/W4385245566","https://openalex.org/W4402702960","https://openalex.org/W4413146028"],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"a":[3,55,86,105,111,116,136,143],"novel":[4],"video":[5,16,25,57,185],"summarization":[6,26,58],"approach":[7],"that":[8,60],"prioritizes":[9],"the":[10,14,40,129,166,170,181],"narrative":[11],"quality":[12],"of":[13,172,183],"summarized":[15],"to":[17,43,67,90,114,148,159],"enhance":[18],"its":[19],"enjoyment":[20],"and":[21,63,79,154,180],"appeal.":[22],"While":[23],"most":[24],"studies":[27],"focus":[28],"on":[29,47,75,165],"extracting":[30],"salient":[31],"scenes":[32],"using":[33],"lowlevel":[34],"visual":[35,77,133],"features,":[36],"they":[37],"often":[38],"neglect":[39],"storytelling":[41],"aspect":[42],"optimize":[44],"numerical":[45],"performance":[46,178],"standard":[48],"benchmarks.":[49,163],"To":[50],"address":[51],"this,":[52],"we":[53],"propose":[54],"multifaceted":[56,174],"method":[59,84],"leverages":[61],"vision":[62,137],"language":[64,107],"foundation":[65,138],"models":[66],"assess":[68],"shot-level":[69],"importance":[70,150],"(e.g.,":[71],"2-sec":[72],"intervals)":[73],"based":[74],"both":[76],"salience":[78],"textual":[80,99],"narrativity.":[81,122],"Specifically,":[82],"our":[83,173],"employs":[85],"vision-language":[87],"model":[88,108],"(VLM)":[89],"generate":[91],"objective":[92],"captions":[93],"for":[94],"individual":[95],"shots.":[96],"These":[97],"shot-wise":[98],"descriptions":[100],"are":[101,140,156],"then":[102],"fed":[103],"into":[104],"large":[106],"(LLM)":[109],"with":[110,120,132,161],"prompt":[112],"designed":[113],"produce":[115],"semantically-coherent":[117],"text":[118,125],"summary":[119],"strong":[121],"The":[123,152],"narrativity-aware":[124],"embeddings":[126,134],"obtained":[127],"by":[128,142],"LLM,":[130],"combined":[131],"from":[135],"model,":[139],"processed":[141],"recurrent":[144],"neural":[145],"network":[146],"(RNN)":[147],"predict":[149],"scores.":[151],"LLM":[153],"RNN":[155],"jointly":[157],"fine-tuned":[158],"align":[160],"existing":[162],"Experiments":[164],"SumMe":[167],"benchmark":[168],"demonstrated":[169],"effectiveness":[171],"approach,":[175],"highlighting":[176],"significant":[177],"improvements":[179],"potential":[182],"text-domain":[184],"summarization.":[186]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-11-28T00:00:00"}
