{"id":"https://openalex.org/W4396877837","doi":"https://doi.org/10.1109/taslp.2024.3399607","title":"AudioLDM 2: Learning Holistic Audio Generation With Self-Supervised Pretraining","display_name":"AudioLDM 2: Learning Holistic Audio Generation With Self-Supervised Pretraining","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4396877837","doi":"https://doi.org/10.1109/taslp.2024.3399607"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3399607","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3399607","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089924305","display_name":"Haohe Liu","orcid":"https://orcid.org/0000-0003-1036-7888"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Haohe Liu","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100593867","display_name":"Yi Yuan","orcid":"https://orcid.org/0000-0002-6887-0956"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yi Yuan","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106407730","display_name":"Xubo Liu","orcid":"https://orcid.org/0009-0004-9950-2672"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xubo Liu","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070892237","display_name":"Xinhao Mei","orcid":"https://orcid.org/0000-0001-6079-5130"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xinhao Mei","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072482416","display_name":"Qiuqiang Kong","orcid":"https://orcid.org/0000-0003-2864-0475"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Qiuqiang Kong","raw_affiliation_strings":["Department of Electronic Engineering, Chinese University of Hong Kong, Hong Kong, SAR, China"],"affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering, Chinese University of Hong Kong, Hong Kong, SAR, China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103162279","display_name":"Qiao Tian","orcid":"https://orcid.org/0000-0002-4078-1273"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao Tian","raw_affiliation_strings":["Speech, Audio &amp; Music Intelligence (SAMI) Group, ByteDance Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Speech, Audio &amp; Music Intelligence (SAMI) Group, ByteDance Inc., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100339128","display_name":"Yu-Ping Wang","orcid":"https://orcid.org/0000-0003-4129-7704"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuping Wang","raw_affiliation_strings":["Speech, Audio &amp; Music Intelligence (SAMI) Group, ByteDance Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Speech, Audio &amp; Music Intelligence (SAMI) Group, ByteDance Inc., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100375990","display_name":"Yuxuan Wang","orcid":"https://orcid.org/0000-0003-1135-3605"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuxuan Wang","raw_affiliation_strings":["Speech, Audio &amp; Music Intelligence (SAMI) Group, ByteDance Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"Speech, Audio &amp; Music Intelligence (SAMI) Group, ByteDance Inc., Beijing, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066967599","display_name":"Mark D. Plumbley","orcid":"https://orcid.org/0000-0002-9708-1075"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mark D. Plumbley","raw_affiliation_strings":["Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K"],"affiliations":[{"raw_affiliation_string":"Centre for Vision, Speech and Signal Processing (CVSSP), University of Surrey, Guilford, U.K","institution_ids":["https://openalex.org/I28290843"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5089924305"],"corresponding_institution_ids":["https://openalex.org/I28290843"],"apc_list":null,"apc_paid":null,"fwci":42.4994,"has_fulltext":false,"cited_by_count":123,"citation_normalized_percentile":{"value":0.99952298,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"32","issue":null,"first_page":"2871","last_page":"2883"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9829999804496765,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9656000137329102,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5002052783966064},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3659588694572449}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5002052783966064},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3659588694572449}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3399607","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3399607","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.6100000143051147}],"awards":[{"id":"https://openalex.org/G7697158467","display_name":"AI for Sound","funder_award_id":"EP/T019751/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G8857457899","display_name":null,"funder_award_id":"EP/T019751/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":103,"referenced_works":["https://openalex.org/W1560729591","https://openalex.org/W1959608418","https://openalex.org/W2052666245","https://openalex.org/W2066334462","https://openalex.org/W2187089797","https://openalex.org/W2526050071","https://openalex.org/W2559726422","https://openalex.org/W2593116425","https://openalex.org/W2973049979","https://openalex.org/W2984284833","https://openalex.org/W3015371781","https://openalex.org/W3046747294","https://openalex.org/W3094502228","https://openalex.org/W3127705815","https://openalex.org/W3155072588","https://openalex.org/W3198694222","https://openalex.org/W3201143670","https://openalex.org/W3203491020","https://openalex.org/W3205475937","https://openalex.org/W3209059054","https://openalex.org/W3215615641","https://openalex.org/W4224035735","https://openalex.org/W4224931676","https://openalex.org/W4226033575","https://openalex.org/W4288089799","https://openalex.org/W4303440777","https://openalex.org/W4312933868","https://openalex.org/W4313447020","https://openalex.org/W4318351475","https://openalex.org/W4318718630","https://openalex.org/W4319989813","https://openalex.org/W4362515116","https://openalex.org/W4367359628","https://openalex.org/W4372259760","https://openalex.org/W4372260310","https://openalex.org/W4372260340","https://openalex.org/W4372266890","https://openalex.org/W4372348103","https://openalex.org/W4375869413","https://openalex.org/W4376632781","https://openalex.org/W4378602476","https://openalex.org/W4378942405","https://openalex.org/W4379251869","https://openalex.org/W4381786045","https://openalex.org/W4385328213","https://openalex.org/W4386071707","https://openalex.org/W4387969125","https://openalex.org/W4389524500","https://openalex.org/W4391020683","https://openalex.org/W4393157029","https://openalex.org/W4393161149","https://openalex.org/W4400033239","https://openalex.org/W6610228593","https://openalex.org/W6633499030","https://openalex.org/W6633724138","https://openalex.org/W6640963894","https://openalex.org/W6728610325","https://openalex.org/W6732646663","https://openalex.org/W6739901393","https://openalex.org/W6757817989","https://openalex.org/W6769627184","https://openalex.org/W6769915798","https://openalex.org/W6771763809","https://openalex.org/W6777694618","https://openalex.org/W6778823374","https://openalex.org/W6779823529","https://openalex.org/W6780218876","https://openalex.org/W6782760101","https://openalex.org/W6783182287","https://openalex.org/W6783713337","https://openalex.org/W6783867762","https://openalex.org/W6786375611","https://openalex.org/W6795261426","https://openalex.org/W6795288823","https://openalex.org/W6797095309","https://openalex.org/W6799642162","https://openalex.org/W6802017037","https://openalex.org/W6802805937","https://openalex.org/W6809885388","https://openalex.org/W6810007534","https://openalex.org/W6810940779","https://openalex.org/W6838639034","https://openalex.org/W6838844135","https://openalex.org/W6840815571","https://openalex.org/W6844305113","https://openalex.org/W6845281891","https://openalex.org/W6845479124","https://openalex.org/W6847076894","https://openalex.org/W6848482659","https://openalex.org/W6849105126","https://openalex.org/W6849109464","https://openalex.org/W6849416043","https://openalex.org/W6849517043","https://openalex.org/W6849635556","https://openalex.org/W6850843143","https://openalex.org/W6851775633","https://openalex.org/W6852824296","https://openalex.org/W6852871851","https://openalex.org/W6852971826","https://openalex.org/W6853096648","https://openalex.org/W6853393314","https://openalex.org/W6855691466","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Although":[0],"audio":[1,48,83,117,130],"generation":[2,100,118],"shares":[3],"commonalities":[4],"across":[5],"different":[6],"types":[7],"of":[8,25,36,47,75,79,129,154,167],"audio,":[9,76],"such":[10,141],"as":[11,142],"speech,":[12,63],"music,":[13,64],"and":[14,28,65,113,157,180],"sound":[15,66],"effects,":[16],"designing":[17],"models":[18],"for":[19,62],"each":[20],"type":[21],"requires":[22],"careful":[23],"consideration":[24],"specific":[26],"objectives":[27],"biases":[29],"that":[30,56],"can":[31,84],"significantly":[32],"differ":[33],"from":[34],"those":[35],"other":[37,104],"types.":[38],"To":[39],"bring":[40],"us":[41],"closer":[42],"to":[43],"a":[44,53,72,92,110,121],"unified":[45],"perspective":[46],"generation,":[49],"this":[50],"paper":[51],"proposes":[52],"holistic":[54],"framework":[55,70,137,172],"utilizes":[57,71],"the":[58,99,127,151,168],"same":[59],"learning":[60,96,119],"method":[61],"effect":[67],"generation.":[68],"Our":[69,176],"general":[73],"representation":[74,95],"called":[77],"\u201clanguage":[78],"audio\u201d":[80],"(LOA).":[81],"Any":[82],"be":[85],"translated":[86],"into":[87,106],"LOA":[88,107,128],"based":[89],"on":[90,126,150],"AudioMAE,":[91],"self-supervised":[93,116,144],"pre-trained":[94],"model.":[97],"In":[98],"process,":[101],"we":[102,114],"translate":[103],"modalities":[105],"by":[108],"using":[109],"GPT-2":[111],"model,":[112,179],"perform":[115],"with":[120,159],"latent":[122,146],"diffusion":[123,147],"model":[124],"conditioned":[125],"in":[131],"our":[132],"training":[133],"set.":[134],"The":[135],"proposed":[136],"naturally":[138],"brings":[139],"advantages":[140],"reusable":[143],"pretrained":[145,178],"models.":[148],"Experiments":[149],"major":[152],"benchmarks":[153],"text-to-audio,":[155],"text-to-music,":[156],"text-to-speech":[158],"three":[160],"AudioLDM":[161,169],"2":[162,170],"variants":[163,171],"demonstrate":[164],"competitive":[165],"performance":[166],"against":[173],"previous":[174],"approaches.":[175],"code,":[177],"demo":[181],"are":[182],"available":[183],"at":[184],"<uri":[185],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[186],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://audioldm.github.io/audioldm2</uri>":[187],".":[188]},"counts_by_year":[{"year":2026,"cited_by_count":9},{"year":2025,"cited_by_count":87},{"year":2024,"cited_by_count":27}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
