{"id":"https://openalex.org/W4408352941","doi":"https://doi.org/10.1109/icassp49660.2025.10890325","title":"DRCap: Decoding CLAP Latents with Retrieval-Augmented Generation for Zero-shot Audio Captioning","display_name":"DRCap: Decoding CLAP Latents with Retrieval-Augmented Generation for Zero-shot Audio Captioning","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352941","doi":"https://doi.org/10.1109/icassp49660.2025.10890325"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890325","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890325","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104326367","display_name":"Xiquan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiquan Li","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090739497","display_name":"Wenxi Chen","orcid":"https://orcid.org/0000-0002-7938-9033"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxi Chen","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438492","display_name":"Ziyang Ma","orcid":"https://orcid.org/0000-0002-0623-9114"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ziyang Ma","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025827045","display_name":"Xuenan Xu","orcid":"https://orcid.org/0000-0001-8718-1278"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuenan Xu","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114117046","display_name":"Yuzhe Liang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuzhe Liang","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068357345","display_name":"Zhisheng Zheng","orcid":"https://orcid.org/0000-0001-7761-9790"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhisheng Zheng","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072482416","display_name":"Qiuqiang Kong","orcid":"https://orcid.org/0000-0003-2864-0475"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiuqiang Kong","raw_affiliation_strings":["The Chinese University of Hong Kong,Department of Electronics Engineering,China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong,Department of Electronics Engineering,China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101293966","display_name":"Xie Chen","orcid":"https://orcid.org/0009-0004-4458-0753"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xie Chen","raw_affiliation_strings":["Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, X-LANCE Lab,China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5104326367"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":6.3913,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.96247952,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9715999960899353,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.927249550819397},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.8285906314849854},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7239619493484497},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.6514243483543396},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5400110483169556},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.5167850255966187},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36230728030204773},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.1431831419467926},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.11767226457595825},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.04359900951385498}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.927249550819397},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.8285906314849854},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7239619493484497},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.6514243483543396},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5400110483169556},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.5167850255966187},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36230728030204773},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1431831419467926},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.11767226457595825},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.04359900951385498},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890325","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890325","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W2593116425","https://openalex.org/W3015591594","https://openalex.org/W3034999214","https://openalex.org/W3094550259","https://openalex.org/W3162999565","https://openalex.org/W3205860970","https://openalex.org/W4226442948","https://openalex.org/W4280567182","https://openalex.org/W4372260310","https://openalex.org/W4372266552","https://openalex.org/W4372340819","https://openalex.org/W4379251221","https://openalex.org/W4387609319","https://openalex.org/W4392902953","https://openalex.org/W4392903033","https://openalex.org/W4392903210","https://openalex.org/W4392903801","https://openalex.org/W4392909554","https://openalex.org/W4400033239","https://openalex.org/W4401023891","https://openalex.org/W4408353237","https://openalex.org/W4410153385","https://openalex.org/W6631190155","https://openalex.org/W6679436768","https://openalex.org/W6766673545","https://openalex.org/W6777615688","https://openalex.org/W6796581206","https://openalex.org/W6810452849","https://openalex.org/W6851149231","https://openalex.org/W6856322222","https://openalex.org/W6856875298","https://openalex.org/W6858827458"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W2963177403","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4283207562","https://openalex.org/W2330246314","https://openalex.org/W2949522393","https://openalex.org/W4289422896"],"abstract_inverted_index":{"While":[0],"automated":[1],"audio":[2,47,108,114,149],"captioning":[3,48],"(AAC)":[4],"has":[5],"made":[6],"notable":[7],"progress,":[8],"traditional":[9],"fully":[10],"supervised":[11],"AAC":[12],"models":[13,264],"still":[14],"face":[15],"two":[16],"critical":[17],"challenges:":[18],"the":[19,85,88,101,107,122,126,132,136,140,145,165,172,187,204,209,213,229,234,238],"need":[20],"for":[21,26,54,113],"expensive":[22],"audio-text":[23],"pair":[24],"data":[25,53],"training":[27,55],"and":[28,44,56,74,139,208,222,233,268],"performance":[29,271],"degradation":[30],"when":[31],"transferring":[32],"across":[33],"domains.":[34],"To":[35,120],"overcome":[36],"these":[37],"limitations,":[38],"we":[39,129],"present":[40],"DRCap,":[41],"a":[42,68,75,92,117,155,179,219,243,252],"data-efficient":[43],"flexible":[45],"zero-shot":[46,118,263],"system":[49],"that":[50,258],"requires":[51],"text-only":[52],"can":[57],"quickly":[58],"adapt":[59,247],"to":[60,110,159,185,192,217,237,246,248],"new":[61,249],"domains":[62,250],"without":[63],"additional":[64],"fine-tuning.":[65],"DRCap":[66,241,259],"integrates":[67],"contrastive":[69],"language-audio":[70],"pre-training":[71],"(CLAP)":[72],"model":[73,78,86,214],"large":[76],"language":[77],"(LLM)":[79],"as":[80,183],"its":[81,197],"backbone.":[82],"During":[83],"training,":[84],"predicts":[87],"ground-truth":[89],"caption":[90,235],"with":[91,106],"fixed":[93],"text":[94,102,156,230],"encoder":[95,103,109,137],"from":[96,135,144,178],"CLAP,":[97],"whereas,":[98],"during":[99],"inference,":[100],"is":[104,215],"replaced":[105],"generate":[111],"captions":[112,176],"clips":[115],"in":[116,251,265,272],"manner.":[119,254],"mitigate":[121],"modality":[123],"gap":[124],"of":[125,169,196],"CLAP":[127,206],"model,":[128],"use":[130],"both":[131,203],"projection":[133],"strategy":[134,143],"side":[138],"retrieval-augmented":[141],"generation":[142],"decoder":[146],"side.":[147],"Specifically,":[148],"embeddings":[150],"are":[151,181],"first":[152],"projected":[153,205],"onto":[154],"embedding":[157,207,231],"support":[158,232],"absorb":[160],"extensive":[161],"semantic":[162],"information":[163],"within":[164],"joint":[166],"multi-modal":[167],"space":[168],"CLAP.":[170],"At":[171],"same":[173],"time,":[174],"similar":[175,211],"retrieved":[177,210],"datastore":[180,236],"fed":[182],"prompts":[184],"instruct":[186],"LLM,":[188],"incorporating":[189],"external":[190],"knowledge":[191],"take":[193],"full":[194],"advantage":[195],"strong":[198],"generative":[199],"capability.":[200],"Conditioned":[201],"on":[202],"captions,":[212],"able":[216],"produce":[218],"more":[220],"accurate":[221],"semantically":[223],"rich":[224],"textual":[225],"description.":[226],"By":[227],"tailoring":[228],"target":[239],"domain,":[240],"acquires":[242],"robust":[244],"ability":[245],"training-free":[253],"Experimental":[255],"results":[256],"demonstrate":[257],"outperforms":[260],"all":[261],"other":[262],"in-domain":[266],"scenarios":[267],"achieves":[269],"state-of-the-art":[270],"cross-domain":[273],"scenarios.":[274]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
