{"id":"https://openalex.org/W2984172489","doi":"https://doi.org/10.1145/3347449.3357484","title":"L-STAP: Learned Spatio-Temporal Adaptive Pooling for Video Captioning","display_name":"L-STAP: Learned Spatio-Temporal Adaptive Pooling for Video Captioning","publication_year":2019,"publication_date":"2019-10-21","ids":{"openalex":"https://openalex.org/W2984172489","doi":"https://doi.org/10.1145/3347449.3357484","mag":"2984172489"},"language":"en","primary_location":{"id":"doi:10.1145/3347449.3357484","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3347449.3357484","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 1st International Workshop on AI for Smart TV Content Production, Access and Delivery","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://hal.science/hal-03555313","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109744334","display_name":"Danny Francis","orcid":null},"institutions":[{"id":"https://openalex.org/I1902872","display_name":"EURECOM","ror":"https://ror.org/00sse7z02","country_code":"FR","type":"education","lineage":["https://openalex.org/I1902872","https://openalex.org/I205703379"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Danny Francis","raw_affiliation_strings":["EURECOM, Biot, France","Eurecom [Sophia Antipolis] (Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex - France)"],"affiliations":[{"raw_affiliation_string":"EURECOM, Biot, France","institution_ids":["https://openalex.org/I1902872"]},{"raw_affiliation_string":"Eurecom [Sophia Antipolis] (Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex - France)","institution_ids":["https://openalex.org/I1902872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038148603","display_name":"Beno\u00eet Huet","orcid":"https://orcid.org/0000-0002-0608-6939"},"institutions":[{"id":"https://openalex.org/I1902872","display_name":"EURECOM","ror":"https://ror.org/00sse7z02","country_code":"FR","type":"education","lineage":["https://openalex.org/I1902872","https://openalex.org/I205703379"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Benoit Huet","raw_affiliation_strings":["EURECOM, Biot, France","Eurecom [Sophia Antipolis] (Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex - France)"],"affiliations":[{"raw_affiliation_string":"EURECOM, Biot, France","institution_ids":["https://openalex.org/I1902872"]},{"raw_affiliation_string":"Eurecom [Sophia Antipolis] (Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex - France)","institution_ids":["https://openalex.org/I1902872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5109744334"],"corresponding_institution_ids":["https://openalex.org/I1902872"],"apc_list":null,"apc_paid":null,"fwci":0.4089,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.66482208,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"33","last_page":"41"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9805669784545898},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8665087819099426},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6760817170143127},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.6630255579948425},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5950286388397217},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5635395646095276},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.49831056594848633},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.4774399399757385},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.41498705744743347},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4055968225002289},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.38215112686157227},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3645437955856323},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3310549259185791},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.1986168920993805}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9805669784545898},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8665087819099426},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6760817170143127},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.6630255579948425},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5950286388397217},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5635395646095276},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.49831056594848633},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.4774399399757385},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.41498705744743347},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4055968225002289},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.38215112686157227},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3645437955856323},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3310549259185791},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1986168920993805},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3347449.3357484","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3347449.3357484","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 1st International Workshop on AI for Smart TV Content Production, Access and Delivery","raw_type":"proceedings-article"},{"id":"pmh:oai:HAL:hal-03555313v1","is_oa":true,"landing_page_url":"https://hal.science/hal-03555313","pdf_url":null,"source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"AI4TV 2019, 1st International Workshop on AI for Smart TV Content Production, Access and Delivery, Oct 2019, Nice, France. pp.33-41, &#x27E8;10.1145/3347449.3357484&#x27E9;","raw_type":"Conference papers"},{"id":"pmh:oai:fr.eurecom:6051","is_oa":false,"landing_page_url":"http://www.eurecom.fr/publication/6051","pdf_url":null,"source":{"id":"https://openalex.org/S4377196942","display_name":"Graduate School and Research Center in Digital Science (EURECOM)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1902872","host_organization_name":"EURECOM","host_organization_lineage":["https://openalex.org/I1902872"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"AI4TV 2019, 1st International Workshop on AI for smart TV content production, access and delivery, co-located with the 27th ACM International Conference on Multimedia, 21 October 2019, Nice, France","raw_type":"Conference"}],"best_oa_location":{"id":"pmh:oai:HAL:hal-03555313v1","is_oa":true,"landing_page_url":"https://hal.science/hal-03555313","pdf_url":null,"source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"AI4TV 2019, 1st International Workshop on AI for Smart TV Content Production, Access and Delivery, Oct 2019, Nice, France. pp.33-41, &#x27E8;10.1145/3347449.3357484&#x27E9;","raw_type":"Conference papers"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7799999713897705,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1586939924","https://openalex.org/W1686810756","https://openalex.org/W1861492603","https://openalex.org/W1895577753","https://openalex.org/W1902237438","https://openalex.org/W1947481528","https://openalex.org/W1956340063","https://openalex.org/W2064675550","https://openalex.org/W2097117768","https://openalex.org/W2101105183","https://openalex.org/W2108598243","https://openalex.org/W2110933980","https://openalex.org/W2133459682","https://openalex.org/W2139501017","https://openalex.org/W2154652894","https://openalex.org/W2157331557","https://openalex.org/W2163605009","https://openalex.org/W2164290393","https://openalex.org/W2194775991","https://openalex.org/W2425121537","https://openalex.org/W2527145521","https://openalex.org/W2554906389","https://openalex.org/W2556388456","https://openalex.org/W2565656701","https://openalex.org/W2745461083","https://openalex.org/W2766520430","https://openalex.org/W2774267535","https://openalex.org/W2798725893","https://openalex.org/W2895845501","https://openalex.org/W2896878184","https://openalex.org/W2905172366","https://openalex.org/W2949888546","https://openalex.org/W2953384591","https://openalex.org/W2962681491","https://openalex.org/W2962937869","https://openalex.org/W2963084599","https://openalex.org/W2963177403","https://openalex.org/W2963524571","https://openalex.org/W2963552819","https://openalex.org/W2964065937","https://openalex.org/W4236965008","https://openalex.org/W6713134421"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W3088136942","https://openalex.org/W2949362007","https://openalex.org/W2775506363","https://openalex.org/W4290852288","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2810679507"],"abstract_inverted_index":{"Automatic":[0],"video":[1,96,137,190],"captioning":[2,38,97,191],"can":[3,17,25,39],"be":[4,18,27,40,65],"used":[5,28],"to":[6,29,54,64,82,88,100,152,161],"enrich":[7],"TV":[8,35],"programs":[9],"with":[10,102],"textual":[11,71],"informations":[12,16],"on":[13,169,176,188],"scenes.":[14],"These":[15],"useful":[19],"for":[20],"visually":[21],"impaired":[22],"people,":[23],"but":[24,86],"also":[26,87],"enhance":[30],"indexing":[31],"and":[32,67,108,128,178],"research":[33],"of":[34,111,146,195],"records.":[36],"Video":[37],"seen":[41],"as":[42],"being":[43],"more":[44],"challenging":[45,57],"than":[46],"image":[47],"captioning.":[48],"In":[49,113],"both":[50],"cases,":[51],"we":[52,116,133,155],"have":[53,98],"tackle":[55],"a":[56,60,70,118,136,140,165],"task":[58,192],"where":[59],"visual":[61],"object":[62],"has":[63],"analyzed,":[66],"translated":[68],"into":[69],"description":[72],"in":[73,95,164,172,193],"natural":[74],"language.":[75],"However,":[76],"analyzing":[77],"videos":[78],"requires":[79],"not":[80],"only":[81],"parse":[83],"still":[84],"images,":[85],"draw":[89],"correspondences":[90],"through":[91,139],"time.":[92],"Recent":[93],"works":[94],"intended":[99],"deal":[101],"these":[103],"issues":[104],"by":[105],"separating":[106],"spatial":[107,127],"temporal":[109,129],"analysis":[110],"videos.":[112],"this":[114],"paper,":[115],"propose":[117],"Learned":[119],"Spatio-Temporal":[120],"Adaptive":[121],"Pooling":[122],"(L-STAP)":[123],"method":[124,184],"that":[125,182],"combines":[126],"analysis.":[130],"More":[131],"specifically,":[132],"first":[134],"process":[135],"frame-by-frame":[138],"Convolutional":[141],"Neural":[142],"Network.":[143],"Then,":[144],"instead":[145],"applying":[147],"an":[148],"average":[149],"pooling":[150],"operation":[151],"reduce":[153],"dimensionality,":[154],"apply":[156],"our":[157,183],"L-STAP,":[158],"which":[159],"attends":[160],"specific":[162],"regions":[163],"given":[166],"frame":[167],"based":[168],"what":[170],"appeared":[171],"previous":[173],"frames.":[174],"Experiments":[175],"MSVD":[177],"MSR-VTT":[179],"datasets":[180],"show":[181],"outperforms":[185],"state-of-the-art":[186],"methods":[187],"the":[189],"terms":[194],"several":[196],"evaluation":[197],"metrics.":[198]},"counts_by_year":[{"year":2021,"cited_by_count":2},{"year":2019,"cited_by_count":2},{"year":2012,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
