{"id":"https://openalex.org/W4403674737","doi":"https://doi.org/10.1109/taslp.2024.3485485","title":"Auffusion: Leveraging the Power of Diffusion and Large Language Models for Text-to-Audio Generation","display_name":"Auffusion: Leveraging the Power of Diffusion and Large Language Models for Text-to-Audio Generation","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4403674737","doi":"https://doi.org/10.1109/taslp.2024.3485485"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3485485","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3485485","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033565985","display_name":"Jinlong Xue","orcid":"https://orcid.org/0009-0000-0442-0932"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jinlong Xue","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008597430","display_name":"Yayue Deng","orcid":"https://orcid.org/0009-0003-7642-4942"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yayue Deng","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079377968","display_name":"Yingming Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yingming Gao","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100343662","display_name":"Ya Li","orcid":"https://orcid.org/0000-0002-6284-5039"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ya Li","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications (BUPT), Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5033565985"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":7.1871,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.97970538,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"32","issue":null,"first_page":"4700","last_page":"4712"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9739000201225281,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9739000201225281,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.6551951169967651},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6000029444694519},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.5064653158187866},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.06738495826721191}],"concepts":[{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.6551951169967651},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6000029444694519},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.5064653158187866},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.06738495826721191},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3485485","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3485485","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.41999998688697815}],"awards":[{"id":"https://openalex.org/G2217149964","display_name":null,"funder_award_id":"2023RC73","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G2249315591","display_name":null,"funder_award_id":"62271083","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7136399700","display_name":null,"funder_award_id":"2023RC13","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":57,"referenced_works":["https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2133824856","https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2752796333","https://openalex.org/W2896457183","https://openalex.org/W2935170919","https://openalex.org/W3015591594","https://openalex.org/W3094550259","https://openalex.org/W3162331882","https://openalex.org/W3162999565","https://openalex.org/W3163162786","https://openalex.org/W3191850102","https://openalex.org/W3196464216","https://openalex.org/W3197032408","https://openalex.org/W3199003182","https://openalex.org/W3212516020","https://openalex.org/W3214281017","https://openalex.org/W4225303417","https://openalex.org/W4307323391","https://openalex.org/W4312933868","https://openalex.org/W4367359628","https://openalex.org/W4372260340","https://openalex.org/W4372266552","https://openalex.org/W4385245566","https://openalex.org/W4385569875","https://openalex.org/W4386076532","https://openalex.org/W4387969125","https://openalex.org/W4390873054","https://openalex.org/W4396877837","https://openalex.org/W4400033239","https://openalex.org/W6679045638","https://openalex.org/W6757817989","https://openalex.org/W6765779288","https://openalex.org/W6766673545","https://openalex.org/W6769627184","https://openalex.org/W6779823529","https://openalex.org/W6783867762","https://openalex.org/W6783906718","https://openalex.org/W6790978476","https://openalex.org/W6791353385","https://openalex.org/W6809885388","https://openalex.org/W6810940779","https://openalex.org/W6838639034","https://openalex.org/W6838843145","https://openalex.org/W6840155194","https://openalex.org/W6840200333","https://openalex.org/W6840815571","https://openalex.org/W6841366371","https://openalex.org/W6845479124","https://openalex.org/W6846655393","https://openalex.org/W6847076894","https://openalex.org/W6849109464","https://openalex.org/W6849416043","https://openalex.org/W6852971826","https://openalex.org/W6853927091"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,132,155,180,201,213],"diffusion":[3,59,124,193],"models":[4,8],"and":[5,45,81,87,100,114,147,163,184,223],"large":[6],"language":[7,30],"(LLMs)":[9],"have":[10],"significantly":[11],"propelled":[12],"the":[13,104,123,135,173,177,181,192],"field":[14,183],"of":[15,138],"generation":[16,22,43],"tasks.":[17],"Text-to-Audio":[18],"(TTA),":[19],"a":[20,64,109],"burgeoning":[21],"application":[23],"designed":[24],"to":[25,71,126,175],"generate":[26,127],"audio":[27,219],"from":[28,55],"natural":[29],"prompts,":[31],"is":[32,153,210],"attracting":[33],"increasing":[34],"attention.":[35],"However,":[36],"existing":[37],"TTA":[38,65,72,95,157,182],"studies":[39,131,162],"often":[40],"struggle":[41],"with":[42],"quality":[44],"text-audio":[46],"alignment,":[47,143],"especially":[48],"for":[49,122],"complex":[50],"textual":[51,207],"inputs.":[52],"Drawing":[53],"inspiration":[54],"state-of-the-art":[56],"Text-to-Image":[57],"(T2I)":[58],"models,":[60],"we":[61,168],"introduce":[62],"Auffusion,":[63],"system":[66],"adapting":[67],"T2I":[68,133],"model":[69,125],"frameworks":[70],"task,":[73],"by":[74],"effectively":[75],"leveraging":[76],"their":[77],"inherent":[78],"generative":[79],"strengths":[80],"precise":[82],"cross-modal":[83,142],"alignment.":[84],"Our":[85,195],"objective":[86],"subjective":[88],"evaluations":[89],"demonstrate":[90],"that":[91,204],"Auffusion":[92],"surpasses":[93],"previous":[94],"approaches":[96],"using":[97],"limited":[98],"data":[99],"computational":[101],"resources.":[102],"Furthermore,":[103],"text":[105,113,189],"encoder":[106,139],"serves":[107],"as":[108,119,218],"critical":[110],"bridge":[111],"between":[112],"audio,":[115],"since":[116],"it":[117],"acts":[118],"an":[120],"instruction":[121],"coherent":[128],"content.":[129],"Previous":[130],"recognize":[134],"significant":[136],"impact":[137],"choice":[140],"on":[141],"like":[144],"fine-grained":[145],"details":[146],"object":[148],"bindings,":[149],"while":[150],"similar":[151],"evaluation":[152],"lacking":[154],"prior":[156],"works.":[158],"Through":[159],"comprehensive":[160],"ablation":[161],"innovative":[164],"cross-attention":[165],"map":[166],"visualizations,":[167],"provide":[169],"insightful":[170],"assessments,":[171],"being":[172],"first":[174],"reveal":[176,197],"internal":[178],"mechanisms":[179],"intuitively":[185],"explain":[186],"how":[187],"different":[188],"encoders":[190],"influence":[191],"process.":[194],"findings":[196],"Auffusion's":[198],"superior":[199],"capability":[200],"generating":[202],"audios":[203],"accurately":[205],"match":[206],"descriptions,":[208],"which":[209],"further":[211],"demonstrated":[212],"several":[214],"related":[215],"tasks,":[216],"such":[217],"style":[220],"transfer,":[221],"inpainting,":[222],"other":[224],"manipulations.":[225]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":16},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
