{"id":"https://openalex.org/W4387969125","doi":"https://doi.org/10.1145/3581783.3612348","title":"Text-to-Audio Generation using Instruction Guided Latent Diffusion Model","display_name":"Text-to-Audio Generation using Instruction Guided Latent Diffusion Model","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387969125","doi":"https://doi.org/10.1145/3581783.3612348"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612348","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612348","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086877005","display_name":"Deepanway Ghosal","orcid":"https://orcid.org/0000-0002-3858-4449"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Deepanway Ghosal","raw_affiliation_strings":["Singapore University of Technology and Design, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007003459","display_name":"Navonil Majumder","orcid":"https://orcid.org/0000-0002-1449-617X"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Navonil Majumder","raw_affiliation_strings":["Singapore University of Technology and Design, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065572305","display_name":"Ambuj Mehrish","orcid":"https://orcid.org/0000-0003-4240-9915"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ambuj Mehrish","raw_affiliation_strings":["Singapore University of Technology and Design, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103285591","display_name":"Soujanya Poria","orcid":null},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Soujanya Poria","raw_affiliation_strings":["Singapore University of Technology and Design, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Singapore University of Technology and Design, Singapore, Singapore","institution_ids":["https://openalex.org/I152815399"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5086877005"],"corresponding_institution_ids":["https://openalex.org/I152815399"],"apc_list":null,"apc_paid":null,"fwci":14.3897,"has_fulltext":false,"cited_by_count":73,"citation_normalized_percentile":{"value":0.99393639,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"3590","last_page":"3598"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8396691083908081},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6976227760314941},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6103180646896362},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5345871448516846},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5271144509315491},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5176354050636292},{"id":"https://openalex.org/keywords/test-set","display_name":"Test set","score":0.4814590811729431},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4481113851070404},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4401409327983856},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09349048137664795}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8396691083908081},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6976227760314941},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6103180646896362},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5345871448516846},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5271144509315491},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5176354050636292},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.4814590811729431},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4481113851070404},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4401409327983856},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09349048137664795},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3581783.3612348","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3612348","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:iris.unive.it:10278/5105948","is_oa":false,"landing_page_url":"https://dl.acm.org/doi/abs/10.1145/3581783.3612348","pdf_url":null,"source":{"id":"https://openalex.org/S4306402336","display_name":"ARCA (Universit\u00e0 Ca' Foscari Venezia)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I149461666","host_organization_name":"Ca' Foscari University of Venice","host_organization_lineage":["https://openalex.org/I149461666"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/conferenceObject"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8399999737739563,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G3761073186","display_name":null,"funder_award_id":"T2MOE2008","funder_id":"https://openalex.org/F4320320751","funder_display_name":"Ministry of Education - Singapore"},{"id":"https://openalex.org/G6584606195","display_name":null,"funder_award_id":"AcRF Tier-2 grant","funder_id":"https://openalex.org/F4320320751","funder_display_name":"Ministry of Education - Singapore"}],"funders":[{"id":"https://openalex.org/F4320320751","display_name":"Ministry of Education - Singapore","ror":"https://ror.org/01kcva023"},{"id":"https://openalex.org/F4320322724","display_name":"Ministry of Education, India","ror":"https://ror.org/048xjjh50"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2133824856","https://openalex.org/W2552465644","https://openalex.org/W2593116425","https://openalex.org/W2595551253","https://openalex.org/W2768188490","https://openalex.org/W2965373594","https://openalex.org/W2972478942","https://openalex.org/W2981852735","https://openalex.org/W3162673269","https://openalex.org/W3184410885","https://openalex.org/W3199384252","https://openalex.org/W4221144097","https://openalex.org/W4224035735","https://openalex.org/W4281485151","https://openalex.org/W4288089799","https://openalex.org/W4297683907","https://openalex.org/W4307079201","https://openalex.org/W4312437435","https://openalex.org/W4312933868","https://openalex.org/W4318752004","https://openalex.org/W4385571886","https://openalex.org/W4390873030","https://openalex.org/W6783713337","https://openalex.org/W6790978476","https://openalex.org/W6840815571","https://openalex.org/W6845479124"],"related_works":["https://openalex.org/W2181948922","https://openalex.org/W2384362569","https://openalex.org/W4205302943","https://openalex.org/W2119949815","https://openalex.org/W2561132942","https://openalex.org/W2142795561","https://openalex.org/W3155418658","https://openalex.org/W2379948177","https://openalex.org/W3098003361","https://openalex.org/W2342291550"],"abstract_inverted_index":{"The":[0,67],"immense":[1],"scale":[2],"of":[3,136],"the":[4,47,56,95,105,113,123,134,147],"recent":[5],"large":[6],"language":[7,31],"models":[8],"(LLM)":[9],"allows":[10],"many":[11,29],"interesting":[12],"properties,":[13],"such":[14,37,41,83],"as,":[15,84],"instruction-":[16],"and":[17,25,101,121],"chain-of-thought-based":[18],"fine-tuning,":[19],"that":[20],"has":[21],"significantly":[22],"improved":[23],"zero-":[24],"few-shot":[26],"performance":[27],"in":[28],"natural":[30],"processing":[32],"(NLP)":[33],"tasks.":[34],"Inspired":[35],"by":[36],"successes,":[38],"we":[39],"adopt":[40],"an":[42,61],"instruction-tuned":[43],"LLM":[44],"Flan-T5":[45],"as":[46],"text":[48,124],"encoder":[49,77,125],"for":[50,142],"text-to-audio":[51],"(TTA)":[52],"generation-a":[53],"task":[54],"where":[55],"goal":[57],"is":[58],"to":[59,133],"generate":[60],"audio":[62,137],"from":[63],"its":[64],"textual":[65],"description.":[66],"prior":[68,148],"works":[69],"on":[70,98,104,107,115],"TTA":[71],"either":[72],"pre-trained":[73],"a":[74,80,116,151],"joint":[75],"text-audio":[76],"or":[78],"used":[79],"non-instruction-tuned":[81],"model,":[82],"T5.":[85],"Consequently,":[86],"our":[87],"latent":[88],"diffusion":[89],"model":[90],"(LDM)-based":[91],"approach":[92],"(Tango)":[93],"outperforms":[94],"state-of-the-art":[96],"AudioLDM":[97],"most":[99],"metrics":[100],"stays":[102],"comparable":[103],"rest":[106],"AudioCaps":[108],"test":[109],"set,":[110],"despite":[111],"training":[112,143],"LDM":[114],"63":[117],"times":[118],"smaller":[119],"dataset":[120],"keeping":[122],"frozen.":[126],"This":[127],"improvement":[128],"might":[129],"also":[130],"be":[131],"attributed":[132],"adoption":[135],"pressure":[138],"level-based":[139],"sound":[140],"mixing":[141],"set":[144],"augmentation,":[145],"whereas":[146],"methods":[149],"take":[150],"random":[152],"mix.":[153]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":43},{"year":2024,"cited_by_count":27}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
