{"id":"https://openalex.org/W4385823080","doi":"https://doi.org/10.21437/interspeech.2023-943","title":"Dual Transformer Decoder based Features Fusion Network for Automated Audio Captioning","display_name":"Dual Transformer Decoder based Features Fusion Network for Automated Audio Captioning","publication_year":2023,"publication_date":"2023-08-14","ids":{"openalex":"https://openalex.org/W4385823080","doi":"https://doi.org/10.21437/interspeech.2023-943"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2023-943","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-943","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101153327","display_name":"Jianyuan Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jianyuan Sun","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080214850","display_name":"Xubo Liu","orcid":"https://orcid.org/0000-0002-2558-0959"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xubo Liu","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070892237","display_name":"Xinhao Mei","orcid":"https://orcid.org/0000-0001-6079-5130"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xinhao Mei","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063512115","display_name":"Volkan K\u0131l\u0131\u00e7","orcid":"https://orcid.org/0000-0002-3164-1981"},"institutions":[{"id":"https://openalex.org/I250383648","display_name":"Izmir K\u00e2tip \u00c7elebi University","ror":"https://ror.org/024nx4843","country_code":"TR","type":"education","lineage":["https://openalex.org/I250383648"]}],"countries":["TR"],"is_corresponding":false,"raw_author_name":"Volkan K\u0131l\u0131\u00e7","raw_affiliation_strings":["Department of Electrical and Electronics Engineering, Izmir Katip Celebi University, Turkey"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Electronics Engineering, Izmir Katip Celebi University, Turkey","institution_ids":["https://openalex.org/I250383648"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066967599","display_name":"Mark D. Plumbley","orcid":"https://orcid.org/0000-0002-9708-1075"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mark D. Plumbley","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, UK","institution_ids":["https://openalex.org/I28290843"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.5543,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.63476538,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"4164","last_page":"4168"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9922000169754028,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7930616140365601},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7586164474487305},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5316720008850098},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5280913710594177},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.5209911465644836},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.44841447472572327},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.427916556596756},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2782309651374817},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.1512334942817688},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.10099378228187561},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09189826250076294}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7930616140365601},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7586164474487305},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5316720008850098},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5280913710594177},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.5209911465644836},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.44841447472572327},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.427916556596756},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2782309651374817},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.1512334942817688},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.10099378228187561},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09189826250076294},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2023-943","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-943","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},{"id":"pmh:oai:alma.44SUR_INST:11208480450002346","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4210197018","display_name":"View","issn_l":"2688-268X","issn":["2688-268X","2688-3988"],"is_oa":false,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320595","host_organization_name":"Wiley","host_organization_lineage":["https://openalex.org/P4310320595"],"host_organization_lineage_names":["Wiley"],"type":"journal"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.6100000143051147,"id":"https://metadata.un.org/sdg/7"}],"awards":[{"id":"https://openalex.org/G8857457899","display_name":null,"funder_award_id":"EP/T019751/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320322626","display_name":"T\u00fcrkiye Bilimsel ve Teknolojik Ara\u015ft\u0131rma Kurumu","ror":"https://ror.org/04w9kkr77"},{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1614298861","https://openalex.org/W1956340063","https://openalex.org/W2506483933","https://openalex.org/W2936774411","https://openalex.org/W2949376505","https://openalex.org/W2964213897","https://openalex.org/W3015591594","https://openalex.org/W3093937952","https://openalex.org/W3097791920","https://openalex.org/W3185739472","https://openalex.org/W3186781156","https://openalex.org/W3189036352","https://openalex.org/W4221152061","https://openalex.org/W4224871700","https://openalex.org/W4295190891","https://openalex.org/W4297853167","https://openalex.org/W4372266890","https://openalex.org/W4385822505"],"related_works":["https://openalex.org/W4320016117","https://openalex.org/W4307856881","https://openalex.org/W2901467237","https://openalex.org/W2547835662","https://openalex.org/W3183824823","https://openalex.org/W3008515501","https://openalex.org/W2923366293","https://openalex.org/W2905654560","https://openalex.org/W2519434724","https://openalex.org/W2596543464"],"abstract_inverted_index":{"Automated":[0],"audio":[1,9,78,93,119],"captioning":[2,120],"(AAC)":[3],"which":[4],"generates":[5],"textual":[6],"descriptions":[7],"of":[8,22,30,41,81],"content.Existing":[10],"AAC":[11],"models":[12],"achieve":[13],"good":[14],"results":[15,113],"but":[16],"only":[17],"use":[18],"the":[19,23,50,98],"high-dimensional":[20,31,35,70,83],"representation":[21],"encoder.There":[24],"is":[25,56],"always":[26],"insufficient":[27],"information":[28,75],"learning":[29],"methods":[32],"owing":[33],"to":[34,67,108],"representations":[36],"having":[37],"a":[38,45,59,103],"large":[39],"amount":[40],"information.In":[42],"this":[43],"paper,":[44],"new":[46,60],"encoder-decoder":[47],"model":[48,87],"called":[49,63],"Lowand":[51],"High-Dimensional":[52],"Feature":[53],"Fusion":[54],"(LHDFF)":[55],"proposed.LHDFF":[57],"uses":[58,102],"PANNs":[61,65],"encoder":[62],"Residual":[64],"(RPANNs)":[66],"fuse":[68],"low-and":[69,82],"features.Lowdimensional":[71],"features":[72,84],"contain":[73],"limited":[74],"about":[76],"specific":[77,92],"scenes.The":[79],"fusion":[80],"can":[85],"improve":[86],"performance":[88],"by":[89],"repeatedly":[90],"emphasizing":[91],"scene":[94],"information.To":[95],"fully":[96],"exploit":[97],"fused":[99],"features,":[100],"LHDFF":[101,116],"dual":[104],"transformer":[105],"decoder":[106],"structure":[107],"generate":[109],"captions":[110],"in":[111],"parallel.Experimental":[112],"show":[114],"that":[115],"outperforms":[117],"existing":[118],"models.":[121]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
