{"id":"https://openalex.org/W4416183470","doi":"https://doi.org/10.1109/sibgrapi67909.2025.11223503","title":"SkimCap: A Transformer-Based Video Captioning Method with Adaptive Attention and Hierarchical Skimming Features","display_name":"SkimCap: A Transformer-Based Video Captioning Method with Adaptive Attention and Hierarchical Skimming Features","publication_year":2025,"publication_date":"2025-09-30","ids":{"openalex":"https://openalex.org/W4416183470","doi":"https://doi.org/10.1109/sibgrapi67909.2025.11223503"},"language":null,"primary_location":{"id":"doi:10.1109/sibgrapi67909.2025.11223503","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sibgrapi67909.2025.11223503","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 38th SIBGRAPI Conference on Graphics, Patterns and Images (SIBGRAPI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049366278","display_name":"Leonardo Vilela Cardoso","orcid":"https://orcid.org/0000-0003-0365-6403"},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":true,"raw_author_name":"Leonardo V. Cardoso","raw_affiliation_strings":["Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil"],"affiliations":[{"raw_affiliation_string":"Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120349510","display_name":"Bernardo P. B. V. da C. Azevedo","orcid":null},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Bernardo P. B. V. da C. Azevedo","raw_affiliation_strings":["Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil"],"affiliations":[{"raw_affiliation_string":"Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070629635","display_name":"Silvio Jamil F. Guimar\u00e3es","orcid":"https://orcid.org/0000-0001-8522-2056"},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Silvio Jamil F. Guimar\u00e3es","raw_affiliation_strings":["Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil"],"affiliations":[{"raw_affiliation_string":"Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5035002611","display_name":"Zenilton K. G. Patroc\u00ednio","orcid":"https://orcid.org/0000-0003-0804-1790"},"institutions":[{"id":"https://openalex.org/I170935008","display_name":"Pontif\u00edcia Universidade Cat\u00f3lica de Minas Gerais","ror":"https://ror.org/03j1rr444","country_code":"BR","type":"education","lineage":["https://openalex.org/I170935008"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Zenilton K. G. Patroc\u00ednio","raw_affiliation_strings":["Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil"],"affiliations":[{"raw_affiliation_string":"Pontifical Catholic University of Minas Gerais (PUC Minas),Laboratory of Image and Multimedia Data Science (IMScience),Belo Horizonte,Brazil","institution_ids":["https://openalex.org/I170935008"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5049366278"],"corresponding_institution_ids":["https://openalex.org/I170935008"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.3682037,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9142000079154968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9142000079154968,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.026000000536441803,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.013899999670684338,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9190999865531921},{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.763700008392334},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.6308000087738037},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5503000020980835},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.4851999878883362},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.47859999537467957},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.46880000829696655},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.460099995136261},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.41510000824928284},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.35589998960494995}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9190999865531921},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7944999933242798},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.763700008392334},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.6308000087738037},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6226999759674072},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5503000020980835},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.4851999878883362},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.47859999537467957},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.46880000829696655},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.460099995136261},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.41510000824928284},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4009000062942505},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.35589998960494995},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.33980000019073486},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3337000012397766},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3231000006198883},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3138999938964844},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C92835128","wikidata":"https://www.wikidata.org/wiki/Q1277447","display_name":"Hierarchical clustering","level":3,"score":0.30869999527931213},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.2896000146865845},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27619999647140503},{"id":"https://openalex.org/C2775941552","wikidata":"https://www.wikidata.org/wiki/Q25212305","display_name":"Isolation (microbiology)","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26510000228881836},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/sibgrapi67909.2025.11223503","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sibgrapi67909.2025.11223503","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 38th SIBGRAPI Conference on Graphics, Patterns and Images (SIBGRAPI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1875942627","https://openalex.org/W2180151948","https://openalex.org/W2883910824","https://openalex.org/W2897439619","https://openalex.org/W2963351113","https://openalex.org/W2963811641","https://openalex.org/W2964110616","https://openalex.org/W2968101724","https://openalex.org/W2981851019","https://openalex.org/W2986670728","https://openalex.org/W3035237998","https://openalex.org/W3134665270","https://openalex.org/W3163093788","https://openalex.org/W4200373610","https://openalex.org/W4283463075","https://openalex.org/W4288083805","https://openalex.org/W4308842366","https://openalex.org/W4313343712","https://openalex.org/W4380077272","https://openalex.org/W4385245566","https://openalex.org/W4388115609","https://openalex.org/W4403535111","https://openalex.org/W4413185497"],"related_works":[],"abstract_inverted_index":{"We":[0,93],"present":[1],"SkimCap,":[2],"a":[3,10,17,57,102,108,132,144],"transformer-based":[4],"video":[5,25,51],"captioning":[6,64,154],"framework":[7],"that":[8,30],"integrates":[9],"memory-augmented":[11],"architecture":[12],"with":[13],"adaptive":[14,82],"attention":[15,83],"and":[16,47,69,90,107,119],"novel":[18],"feature":[19,133],"selection":[20,134],"strategy":[21],"grounded":[22,71],"in":[23],"hierarchical":[24,43,129],"skimming.":[26],"Unlike":[27],"traditional":[28],"approaches":[29],"rely":[31],"on":[32,96],"uniformly":[33],"sampled":[34],"frames":[35],"or":[36],"pre-defined":[37],"temporal":[38,85],"segments,":[39],"SkimCap":[40,95,142],"performs":[41],"unsupervised":[42],"clustering":[44],"to":[45,62,139],"identify":[46],"extract":[48],"semantically":[49],"salient":[50],"shots.":[52],"These":[53],"condensed":[54],"representations":[55],"provide":[56],"compact":[58],"yet":[59],"information-rich":[60],"input":[61],"the":[63,126],"model,":[65],"enabling":[66],"more":[67],"accurate":[68],"contextually":[70],"sentence":[72],"generation.":[73],"The":[74],"memory":[75],"module":[76],"enhances":[77],"long-range":[78],"dependency":[79],"modeling,":[80],"while":[81],"improves":[84],"alignment":[86],"between":[87],"visual":[88,150],"cues":[89],"generated":[91],"tokens.":[92],"evaluate":[94],"ActivityNet,":[97],"achieving":[98],"CIDEr-D":[99],"of":[100,105,113,128],"25.44,":[101],"BLEU-4":[103],"(B@4)":[104],"10.77,":[106],"lower":[109],"Repetition-4":[110],"(R@4)":[111],"score":[112],"5.84,":[114],"representing":[115],"consistent":[116],"caption":[117],"quality":[118],"relevance":[120],"improvements.":[121],"An":[122],"ablation":[123],"study":[124],"confirms":[125],"effectiveness":[127],"skimming":[130],"as":[131],"mechanism,":[135],"highlighting":[136],"its":[137],"contribution":[138],"overall":[140],"performance.":[141],"sets":[143],"new":[145],"direction":[146],"for":[147],"incorporating":[148],"structured":[149],"summarization":[151],"into":[152],"end-to-end":[153],"systems.":[155]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-11T00:00:00"}
