{"id":"https://openalex.org/W2785892019","doi":"https://doi.org/10.1109/asru.2017.8268968","title":"Early and late integration of audio features for automatic video description","display_name":"Early and late integration of audio features for automatic video description","publication_year":2017,"publication_date":"2017-12-01","ids":{"openalex":"https://openalex.org/W2785892019","doi":"https://doi.org/10.1109/asru.2017.8268968","mag":"2785892019"},"language":"en","primary_location":{"id":"doi:10.1109/asru.2017.8268968","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru.2017.8268968","pdf_url":null,"source":{"id":"https://openalex.org/S4306498158","display_name":"2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001601327","display_name":"Chiori Hori","orcid":"https://orcid.org/0000-0002-4201-7578"},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Chiori Hori","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087554069","display_name":"Takaaki Hori","orcid":"https://orcid.org/0000-0003-4560-8039"},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Takaaki Hori","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008369672","display_name":"Tim K. Marks","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tim K. Marks","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112763337","display_name":"John R. Hershey","orcid":null},"institutions":[{"id":"https://openalex.org/I4210159266","display_name":"Mitsubishi Electric (United States)","ror":"https://ror.org/053jnhe44","country_code":"US","type":"company","lineage":["https://openalex.org/I1306287861","https://openalex.org/I4210133125","https://openalex.org/I4210159266"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John R. Hershey","raw_affiliation_strings":["Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Mitsubishi Electric Research Laboratories (MERL), Cambridge, MA, USA","institution_ids":["https://openalex.org/I4210159266"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5001601327"],"corresponding_institution_ids":["https://openalex.org/I4210159266"],"apc_list":null,"apc_paid":null,"fwci":1.8492,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.87449113,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.785224199295044},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.3529757261276245},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3459780812263489},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.32758867740631104}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.785224199295044},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3529757261276245},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3459780812263489},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.32758867740631104}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru.2017.8268968","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru.2017.8268968","pdf_url":null,"source":{"id":"https://openalex.org/S4306498158","display_name":"2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W173561343","https://openalex.org/W854541894","https://openalex.org/W1522734439","https://openalex.org/W1528470941","https://openalex.org/W1573040851","https://openalex.org/W1586939924","https://openalex.org/W1686810756","https://openalex.org/W1889081078","https://openalex.org/W1904457459","https://openalex.org/W1957740064","https://openalex.org/W2016053056","https://openalex.org/W2101105183","https://openalex.org/W2118972857","https://openalex.org/W2133459682","https://openalex.org/W2133564696","https://openalex.org/W2136657878","https://openalex.org/W2142900973","https://openalex.org/W2163605009","https://openalex.org/W2425121537","https://openalex.org/W2584992898","https://openalex.org/W2962756039","https://openalex.org/W2962835968","https://openalex.org/W2963576560","https://openalex.org/W2963911037","https://openalex.org/W2964308564","https://openalex.org/W4293665662","https://openalex.org/W6607114211","https://openalex.org/W6634426396","https://openalex.org/W6637373629","https://openalex.org/W6638444622","https://openalex.org/W6639432524","https://openalex.org/W6640980151","https://openalex.org/W6679434410","https://openalex.org/W6680129566","https://openalex.org/W6684191040","https://openalex.org/W6729831399","https://openalex.org/W6898505805"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"our":[3,100,190],"approach":[4],"to":[5,25,34,73,110,137,211],"improve":[6,139,171],"video":[7,13,35,76],"captioning":[8,16,36],"by":[9,178],"integrating":[10],"audio":[11,52,115,197,203,221],"and":[12,51,54,95,114,149,156,164,201,214],"features.":[14],"Video":[15],"is":[17],"the":[18,27,63,75,118,124,140,154,172,179,187,202,220],"task":[19],"of":[20,29,58,162,167,189,196],"generating":[21],"a":[22,30,44,56],"textual":[23],"description":[24],"describe":[26,62],"content":[28],"video.":[31],"State-of-the-art":[32],"approaches":[33],"are":[37,121],"based":[38,131],"on":[39,132,144],"sequence-to-sequence":[40],"models,":[41],"in":[42,66,87,123],"which":[43,83,208],"single":[45],"neural":[46],"network":[47,70],"accepts":[48],"sequential":[49],"images":[50],"data,":[53],"outputs":[55],"sequence":[57],"words":[59],"that":[60,160],"best":[61],"input":[64,77],"data":[65],"natural":[67],"language.":[68],"The":[69],"thus":[71],"learns":[72],"encode":[74],"into":[78],"an":[79,105],"intermediate":[80],"semantic":[81,174],"representation,":[82,175],"can":[84],"be":[85],"useful":[86],"applications":[88],"such":[89],"as":[90,176],"multimedia":[91],"indexing,":[92],"automatic":[93],"narration,":[94],"audio-visual":[96,173],"question":[97],"answering.":[98],"In":[99,183],"prior":[101],"work,":[102],"we":[103,127,185],"proposed":[104],"attention-based":[106],"multi-modal":[107],"fusion":[108],"mechanism":[109],"integrate":[111],"image,":[112],"motion,":[113],"features,":[116,200],"where":[117],"multiple":[119],"features":[120,169,204],"integrated":[122],"network.":[125],"Here,":[126],"apply":[128],"hypothesis-level":[129],"integration":[130,166],"minimum":[133],"Bayes-risk":[134],"(MBR)":[135],"decoding":[136],"further":[138],"caption":[141,181],"quality,":[142],"focusing":[143],"well-known":[145],"evaluation":[146],"metrics":[147],"(BLEU":[148],"METEOR":[150],"scores).":[151],"Experiments":[152],"with":[153],"YouTube2Text":[155],"MSR-VTT":[157],"datasets":[158],"demonstrate":[159],"combinations":[161],"early":[163],"late":[165],"multimodal":[168],"significantly":[170],"measured":[177],"resulting":[180],"quality.":[182],"addition,":[184],"compared":[186],"performance":[188],"method":[191],"using":[192,206,218],"two":[193],"different":[194],"types":[195],"features:":[198],"MFCC":[199],"extracted":[205],"SoundNet,":[207],"was":[209],"trained":[210],"recognize":[212],"objects":[213],"scenes":[215],"from":[216],"videos":[217],"only":[219],"signals.":[222]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
