{"id":"https://openalex.org/W6967783044","doi":"https://doi.org/10.5281/zenodo.12798294","title":"ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio Generation with Consistency Distillation","display_name":"ConsistencyTTA: Accelerating Diffusion-Based Text-to-Audio Generation with Consistency Distillation","publication_year":2024,"publication_date":"2024-07-23","ids":{"openalex":"https://openalex.org/W6967783044","doi":"https://doi.org/10.5281/zenodo.12798294"},"language":"en","primary_location":{"id":"doi:10.5281/zenodo.12798294","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.12798294","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"other","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.12798294","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bai, Yatong","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Bai, Yatong","raw_affiliation_strings":["University of California, Berkeley"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of California, Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Dang, Trung","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dang, Trung","raw_affiliation_strings":["Microsoft (United States)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Microsoft (United States)","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tran, Dung","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tran, Dung","raw_affiliation_strings":["Microsoft (United States)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Microsoft (United States)","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Koishida, Kazuhito","orcid":null},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Koishida, Kazuhito","raw_affiliation_strings":["Microsoft (United States)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Microsoft (United States)","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":null,"display_name":"Sojoudi, Somayeh","orcid":"https://orcid.org/0000-0001-7177-7712"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sojoudi, Somayeh","raw_affiliation_strings":["University of California, Berkeley"],"raw_orcid":"https://orcid.org/0000-0001-7177-7712","affiliations":[{"raw_affiliation_string":"University of California, Berkeley","institution_ids":["https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":null,"topics":[],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6462000012397766},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5958999991416931},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4717000126838684},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4032000005245209},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.3425999879837036},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.31450000405311584}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.70169997215271},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6462000012397766},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5958999991416931},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4717000126838684},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4032000005245209},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36340001225471497},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3249000012874603},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.2996000051498413},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.2648000121116638}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.12798294","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.12798294","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.5281/zenodo.12798294","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.12798294","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Diffusion":[0],"models":[1,13,41,92],"power":[2],"a":[3,16,45,76],"vast":[4],"majority":[5],"of":[6,55],"the":[7,24,52,85,128,133,137,149],"text-to-audio":[8,40],"generation":[9,53,74,97,106,150],"methods.":[10],"Unfortunately,":[11],"diffusion":[12,94,102],"suffer":[14],"from":[15],"slow":[17],"inference":[18],"speed":[19],"due":[20],"to":[21,118],"iteratively":[22],"querying":[23],"underlying":[25],"denoising":[26],"network,":[27],"thus":[28],"unsuitable":[29],"for":[30],"applications":[31],"with":[32,122],"time":[33],"or":[34],"computational":[35],"constraints.":[36],"This":[37],"work":[38],"proposes":[39],"that":[42,144],"only":[43],"require":[44],"single":[46],"non-autoregressive":[47],"neural":[48],"network":[49],"query,":[50],"accelerating":[51],"hundreds":[54],"times":[56],"and":[57,79,99],"enabling":[58],"on-device":[59],"audio":[60,110],"generation.":[61],"To":[62],"achieve":[63],"this,":[64],"we":[65],"propose":[66],"\"CFG-aware":[67],"latent":[68,77],"consistency":[69,73],"model'',":[70],"which":[71],"moves":[72],"into":[75,84],"space":[78],"incorporates":[80],"classifier-free":[81],"guidance":[82],"(CFG)":[83],"training":[86],"process.":[87],"By":[88],"doing":[89],"so,":[90],"our":[91],"retain":[93],"models'":[95],"impressive":[96],"quality":[98],"diversity.":[100],"Unlike":[101],"models,":[103],"ConsistencyTTA's":[104],"single-step":[105],"makes":[107],"its":[108],"generated":[109],"available":[111],"during":[112],"training.":[113],"We":[114,135],"leverage":[115],"this":[116],"advantage":[117],"finetune":[119],"ConsistencyTTA":[120],"end-to-end":[121,145],"audio-space":[123],"text-aware":[124],"metrics,":[125],"such":[126],"as":[127,140],"CLAP":[129,138],"score,":[130],"further":[131,147],"enhancing":[132],"generations.":[134],"use":[136],"loss":[139],"an":[141],"example,":[142],"confirming":[143],"fine-tuning":[146],"boosts":[148],"quality.":[151]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
