{"id":"https://openalex.org/W4387609319","doi":"https://doi.org/10.1109/taslp.2023.3321968","title":"Beyond the Status Quo: A Contemporary Survey of Advances and Challenges in Audio Captioning","display_name":"Beyond the Status Quo: A Contemporary Survey of Advances and Challenges in Audio Captioning","publication_year":2023,"publication_date":"2023-10-13","ids":{"openalex":"https://openalex.org/W4387609319","doi":"https://doi.org/10.1109/taslp.2023.3321968"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3321968","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3321968","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000223918","display_name":"Zeyu Xie","orcid":"https://orcid.org/0009-0001-9546-3301"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyu Xie","raw_affiliation_strings":["X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081865665","display_name":"Mengyue Wu","orcid":"https://orcid.org/0000-0002-5599-8707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Wu","raw_affiliation_strings":["X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043098653","display_name":"Kai Yu","orcid":"https://orcid.org/0000-0002-7102-9826"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Yu","raw_affiliation_strings":["X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"X-LANCE Lab, Department of Computer Science and Engineering, MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5025827045"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":3.0498,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.92664455,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"32","issue":null,"first_page":"95","last_page":"112"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9883999824523926,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9715999960899353,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.882215678691864},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8238918781280518},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5397395491600037},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5141058564186096},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.510296106338501},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4510573148727417},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4407590925693512},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.42976123094558716},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3770524561405182},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.35702353715896606},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.33344683051109314}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.882215678691864},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8238918781280518},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5397395491600037},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5141058564186096},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.510296106338501},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4510573148727417},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4407590925693512},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.42976123094558716},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3770524561405182},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.35702353715896606},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.33344683051109314},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3321968","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3321968","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8199999928474426,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G3420185677","display_name":null,"funder_award_id":"61901265","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":118,"referenced_works":["https://openalex.org/W68733909","https://openalex.org/W648786980","https://openalex.org/W1523493493","https://openalex.org/W1861492603","https://openalex.org/W1897761818","https://openalex.org/W2016589492","https://openalex.org/W2033875152","https://openalex.org/W2081580037","https://openalex.org/W2123301721","https://openalex.org/W2154652894","https://openalex.org/W2250378130","https://openalex.org/W2506483933","https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2627092829","https://openalex.org/W2766375149","https://openalex.org/W2788277448","https://openalex.org/W2896457183","https://openalex.org/W2903928149","https://openalex.org/W2905139236","https://openalex.org/W2916103538","https://openalex.org/W2936695845","https://openalex.org/W2936774411","https://openalex.org/W2938704169","https://openalex.org/W2963084599","https://openalex.org/W2963351448","https://openalex.org/W2964213897","https://openalex.org/W2970641574","https://openalex.org/W2982554818","https://openalex.org/W2982669287","https://openalex.org/W3015591594","https://openalex.org/W3037013468","https://openalex.org/W3037468418","https://openalex.org/W3038899388","https://openalex.org/W3042165234","https://openalex.org/W3088092535","https://openalex.org/W3093839391","https://openalex.org/W3094502228","https://openalex.org/W3094550259","https://openalex.org/W3094760673","https://openalex.org/W3097791920","https://openalex.org/W3112467147","https://openalex.org/W3122335742","https://openalex.org/W3135656708","https://openalex.org/W3160577380","https://openalex.org/W3163843406","https://openalex.org/W3174770825","https://openalex.org/W3185739472","https://openalex.org/W3186781156","https://openalex.org/W3187963534","https://openalex.org/W3189036352","https://openalex.org/W3196974791","https://openalex.org/W3198452188","https://openalex.org/W3198860978","https://openalex.org/W3201326582","https://openalex.org/W3204936728","https://openalex.org/W3205475937","https://openalex.org/W3205708381","https://openalex.org/W3205860970","https://openalex.org/W3206857696","https://openalex.org/W3207373632","https://openalex.org/W4206047378","https://openalex.org/W4221153784","https://openalex.org/W4224926581","https://openalex.org/W4224932123","https://openalex.org/W4226174231","https://openalex.org/W4280567182","https://openalex.org/W4287075195","https://openalex.org/W4287751091","https://openalex.org/W4295723153","https://openalex.org/W4297841258","https://openalex.org/W4297841355","https://openalex.org/W4297841816","https://openalex.org/W4304460126","https://openalex.org/W4309129408","https://openalex.org/W4312095907","https://openalex.org/W4312120844","https://openalex.org/W4312898271","https://openalex.org/W4315701203","https://openalex.org/W4320002535","https://openalex.org/W4362706672","https://openalex.org/W4364373974","https://openalex.org/W4372260051","https://openalex.org/W4372340819","https://openalex.org/W4372341409","https://openalex.org/W4375869156","https://openalex.org/W4383340360","https://openalex.org/W4384160435","https://openalex.org/W4385245566","https://openalex.org/W4385822505","https://openalex.org/W4385822843","https://openalex.org/W4400033239","https://openalex.org/W6621543089","https://openalex.org/W6639102338","https://openalex.org/W6739901393","https://openalex.org/W6752516136","https://openalex.org/W6755207826","https://openalex.org/W6757408916","https://openalex.org/W6761205521","https://openalex.org/W6761551260","https://openalex.org/W6780379688","https://openalex.org/W6780673046","https://openalex.org/W6783591283","https://openalex.org/W6784333009","https://openalex.org/W6784400889","https://openalex.org/W6784499681","https://openalex.org/W6787564935","https://openalex.org/W6798955355","https://openalex.org/W6799178980","https://openalex.org/W6799303324","https://openalex.org/W6799494382","https://openalex.org/W6801865844","https://openalex.org/W6802577842","https://openalex.org/W6802998326","https://openalex.org/W6811156277","https://openalex.org/W6846793904","https://openalex.org/W6850843143","https://openalex.org/W6851994061"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2949362007","https://openalex.org/W2775506363","https://openalex.org/W4290852288","https://openalex.org/W3088136942","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393","https://openalex.org/W4289422896"],"abstract_inverted_index":{"Automated":[0],"audio":[1,15],"captioning":[2],"(AAC),":[3],"a":[4,52,123],"task":[5],"that":[6],"mimics":[7],"human":[8],"perception":[9],"as":[10,12,35,122],"well":[11],"innovatively":[13],"links":[14],"processing":[16],"and":[17,40,47,77,88,107,131,141],"natural":[18],"language":[19,101],"processing,":[20],"has":[21],"overseen":[22],"much":[23],"progress":[24],"over":[25],"the":[26,36,41,62,113,127,135,142],"last":[27],"few":[28],"years.":[29],"AAC":[30,130],"requires":[31],"recognizing":[32],"contents":[33],"such":[34],"environment,":[37],"sound":[38,45],"events":[39,46],"temporal":[42],"relationships":[43],"between":[44,129],"describing":[48],"these":[49],"elements":[50],"with":[51,147],"fluent":[53],"sentence.":[54],"Currently,":[55],"an":[56],"encoder-decoder-based":[57],"deep":[58,137],"learning":[59,87,138],"framework":[60],"is":[61],"standard":[63],"approach":[64],"to":[65,150],"tackle":[66],"this":[67,116],"problem.":[68],"Plenty":[69],"of":[70,115],"works":[71],"have":[72],"proposed":[73],"novel":[74],"network":[75],"architectures":[76],"training":[78],"schemes,":[79],"including":[80],"extra":[81],"guidance,":[82],"reinforcement":[83],"learning,":[84],"audio-text":[85],"self-supervised":[86],"diverse":[89],"or":[90],"controllable":[91],"captioning.":[92],"Effective":[93],"data":[94],"augmentation":[95],"techniques,":[96,139],"especially":[97],"based":[98],"on":[99],"large":[100],"models":[102],"are":[103],"explored.":[104],"Benchmark":[105],"datasets":[106],"AAC-oriented":[108],"evaluation":[109,143],"metrics":[110,144],"also":[111],"accelerate":[112],"improvement":[114],"field.":[117],"This":[118],"article":[119],"situates":[120],"itself":[121],"comprehensive":[124],"survey":[125],"covering":[126],"comparison":[128],"its":[132],"related":[133],"tasks,":[134],"existing":[136],"datasets,":[140],"in":[145],"AAC,":[146],"insights":[148],"provided":[149],"guide":[151],"potential":[152],"future":[153],"research":[154],"directions.":[155]},"counts_by_year":[{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
