{"id":"https://openalex.org/W2726051499","doi":"https://doi.org/10.1109/waspaa.2017.8170058","title":"Automated audio captioning with recurrent neural networks","display_name":"Automated audio captioning with recurrent neural networks","publication_year":2017,"publication_date":"2017-10-01","ids":{"openalex":"https://openalex.org/W2726051499","doi":"https://doi.org/10.1109/waspaa.2017.8170058","mag":"2726051499"},"language":"en","primary_location":{"id":"doi:10.1109/waspaa.2017.8170058","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa.2017.8170058","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1706.10006","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108358814","display_name":"Konstantinos Drossos","orcid":null},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]},{"id":"https://openalex.org/I150589677","display_name":"Tampere University of Applied Sciences","ror":"https://ror.org/00bwtjf83","country_code":"FI","type":"education","lineage":["https://openalex.org/I150589677"]},{"id":"https://openalex.org/I4210133110","display_name":"Tampere University","ror":null,"country_code":"FI","type":null,"lineage":["https://openalex.org/I4210133110"]}],"countries":["FI"],"is_corresponding":true,"raw_author_name":"Konstantinos Drossos","raw_affiliation_strings":["Tampere University of Technology, Tampere, Finland","Tampere university of technology, Tampere, Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University of Technology, Tampere, Finland","institution_ids":["https://openalex.org/I4210133110"]},{"raw_affiliation_string":"Tampere university of technology, Tampere, Finland","institution_ids":["https://openalex.org/I166825849","https://openalex.org/I150589677","https://openalex.org/I4210133110"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054490307","display_name":"Sharath Adavanne","orcid":"https://orcid.org/0000-0002-5001-6911"},"institutions":[{"id":"https://openalex.org/I150589677","display_name":"Tampere University of Applied Sciences","ror":"https://ror.org/00bwtjf83","country_code":"FI","type":"education","lineage":["https://openalex.org/I150589677"]},{"id":"https://openalex.org/I4210133110","display_name":"Tampere University","ror":null,"country_code":"FI","type":null,"lineage":["https://openalex.org/I4210133110"]},{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Sharath Adavanne","raw_affiliation_strings":["Tampere University of Technology, Tampere, Finland","Tampere university of technology, Tampere, Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University of Technology, Tampere, Finland","institution_ids":["https://openalex.org/I4210133110"]},{"raw_affiliation_string":"Tampere university of technology, Tampere, Finland","institution_ids":["https://openalex.org/I166825849","https://openalex.org/I150589677","https://openalex.org/I4210133110"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049691461","display_name":"Tuomas Virtanen","orcid":"https://orcid.org/0000-0002-4604-9729"},"institutions":[{"id":"https://openalex.org/I166825849","display_name":"Tampere University","ror":"https://ror.org/033003e23","country_code":"FI","type":"education","lineage":["https://openalex.org/I166825849"]},{"id":"https://openalex.org/I4210133110","display_name":"Tampere University","ror":null,"country_code":"FI","type":null,"lineage":["https://openalex.org/I4210133110"]},{"id":"https://openalex.org/I150589677","display_name":"Tampere University of Applied Sciences","ror":"https://ror.org/00bwtjf83","country_code":"FI","type":"education","lineage":["https://openalex.org/I150589677"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Tuomas Virtanen","raw_affiliation_strings":["Tampere University of Technology, Tampere, Finland","Tampere university of technology, Tampere, Finland"],"affiliations":[{"raw_affiliation_string":"Tampere University of Technology, Tampere, Finland","institution_ids":["https://openalex.org/I4210133110"]},{"raw_affiliation_string":"Tampere university of technology, Tampere, Finland","institution_ids":["https://openalex.org/I166825849","https://openalex.org/I150589677","https://openalex.org/I4210133110"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5108358814"],"corresponding_institution_ids":["https://openalex.org/I150589677","https://openalex.org/I166825849","https://openalex.org/I4210133110"],"apc_list":null,"apc_paid":null,"fwci":0.40470425,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":{"value":0.56106317,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"374","last_page":"378"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9656970500946045},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7995878458023071},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7987061738967896},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.5968742370605469},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5732200145721436},{"id":"https://openalex.org/keywords/layer","display_name":"Layer (electronics)","score":0.5688011050224304},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5555692315101624},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.5484923720359802},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4658048748970032},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.41088372468948364},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.36599159240722656},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3482569456100464},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3324926495552063},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.0668560266494751}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9656970500946045},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7995878458023071},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7987061738967896},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.5968742370605469},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5732200145721436},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.5688011050224304},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5555692315101624},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.5484923720359802},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4658048748970032},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.41088372468948364},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.36599159240722656},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3482569456100464},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3324926495552063},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0668560266494751},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/waspaa.2017.8170058","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa.2017.8170058","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1706.10006","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1706.10006","pdf_url":"https://arxiv.org/pdf/1706.10006","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:2726051499","is_oa":true,"landing_page_url":"http://export.arxiv.org/pdf/1706.10006","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1706.10006","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1706.10006","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1706.10006","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1706.10006","pdf_url":"https://arxiv.org/pdf/1706.10006","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320322725","display_name":"China Scholarship Council","ror":"https://ror.org/04atp4p48"},{"id":"https://openalex.org/F4320332999","display_name":"Horizon 2020 Framework Programme","ror":"https://ror.org/00k4n6c32"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2726051499.pdf","grobid_xml":"https://content.openalex.org/works/W2726051499.grobid-xml"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W1889081078","https://openalex.org/W1895577753","https://openalex.org/W1956340063","https://openalex.org/W2034213230","https://openalex.org/W2101105183","https://openalex.org/W2133512280","https://openalex.org/W2154652894","https://openalex.org/W2157331557","https://openalex.org/W2302086703","https://openalex.org/W2384495648","https://openalex.org/W2405676915","https://openalex.org/W2525778437","https://openalex.org/W2550821151","https://openalex.org/W2594627932","https://openalex.org/W2950635152","https://openalex.org/W2953022248","https://openalex.org/W2963099423","https://openalex.org/W2963498278","https://openalex.org/W2964121744","https://openalex.org/W2964241990","https://openalex.org/W2964308564","https://openalex.org/W6631190155","https://openalex.org/W6631943919","https://openalex.org/W6639432524","https://openalex.org/W6679434410","https://openalex.org/W6682631176","https://openalex.org/W6696761078","https://openalex.org/W6727690538","https://openalex.org/W6729214272","https://openalex.org/W6898505805"],"related_works":["https://openalex.org/W2806393949","https://openalex.org/W3003130078","https://openalex.org/W3129415484","https://openalex.org/W2968304700","https://openalex.org/W2737505212","https://openalex.org/W3097916481","https://openalex.org/W3091422775","https://openalex.org/W3075961206","https://openalex.org/W3131889802","https://openalex.org/W3122879976","https://openalex.org/W2991723581","https://openalex.org/W3079229479","https://openalex.org/W1514329655","https://openalex.org/W2932074147","https://openalex.org/W3142367608","https://openalex.org/W3179094930","https://openalex.org/W3178957312","https://openalex.org/W2831615018","https://openalex.org/W3153404572","https://openalex.org/W3018144978"],"abstract_inverted_index":{"We":[0,9],"present":[1],"the":[2,23,38,59,70,74,80,128,136],"first":[3],"approach":[4],"to":[5,22,69],"automated":[6],"audio":[7,35],"captioning.":[8],"employ":[10],"an":[11,15,34],"encoder-decoder":[12],"scheme":[13],"with":[14,64,87],"alignment":[16,81],"model":[17,82],"in":[18,116,135],"between.":[19],"The":[20,48,76,92,108],"input":[21],"encoder":[24,49],"is":[25,40,50,95],"a":[26,41,46,51,61,65,101],"sequence":[27,42],"of":[28,43,73],"log":[29],"mel-band":[30],"energies":[31],"calculated":[32],"from":[33,100,124],"file,":[36],"while":[37],"output":[39],"words,":[44],"i.e.":[45],"caption.":[47],"multi-layered,":[52],"bi-directional":[53],"gated":[54],"recurrent":[55],"unit":[56],"(GRU)":[57],"and":[58,79,119],"decoder":[60],"multi-layered":[62],"GRU":[63,72],"classification":[66,77],"layer":[67,78],"connected":[68,85],"last":[71],"decoder.":[75],"are":[83],"fully":[84],"layers":[86],"shared":[88],"weights":[89],"between":[90],"timesteps.":[91],"proposed":[93,129],"method":[94,130],"evaluated":[96],"using":[97],"data":[98],"drawn":[99],"commercial":[102],"sound":[103],"effects":[104],"library,":[105],"ProSound":[106],"Effects.":[107],"resulting":[109],"captions":[110],"were":[111],"rated":[112],"through":[113],"metrics":[114,125],"utilized":[115],"machine":[117],"translation":[118],"image":[120],"captioning":[121],"fields.":[122],"Results":[123],"show":[126],"that":[127],"can":[131],"predict":[132],"words":[133],"appearing":[134],"original":[137],"caption,":[138],"but":[139],"not":[140],"always":[141],"correctly":[142],"ordered.":[143]},"counts_by_year":[{"year":2023,"cited_by_count":2},{"year":2021,"cited_by_count":3},{"year":2019,"cited_by_count":2}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
