{"id":"https://openalex.org/W4412886987","doi":"https://doi.org/10.18653/v1/2025.acl-industry.51","title":"SpeechWeave: Diverse Multilingual Synthetic Text &amp; Audio Data Generation Pipeline for Training Text to Speech Models","display_name":"SpeechWeave: Diverse Multilingual Synthetic Text &amp; Audio Data Generation Pipeline for Training Text to Speech Models","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412886987","doi":"https://doi.org/10.18653/v1/2025.acl-industry.51"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-industry.51","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-industry.51","pdf_url":"https://aclanthology.org/2025.acl-industry.51.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.acl-industry.51.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119180943","display_name":"Karan Dua","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Karan Dua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030196031","display_name":"Puneet Mittal","orcid":"https://orcid.org/0000-0001-6121-5687"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Puneet Mittal","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ranjeet Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ranjeet Gupta","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5114200451","display_name":"Hitesh Laxmichand Patel","orcid":"https://orcid.org/0009-0009-7492-5173"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hitesh Laxmichand Patel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5119180943"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.9995,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88942522,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"718","last_page":"737"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9333999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9333999752998352,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8061157464981079},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.8034890294075012},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6143711805343628},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5715534090995789},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5173465609550476},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.47526687383651733},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4508481025695801},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.4240078330039978},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.09803470969200134},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.055863916873931885}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8061157464981079},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.8034890294075012},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6143711805343628},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5715534090995789},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5173465609550476},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.47526687383651733},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4508481025695801},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.4240078330039978},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.09803470969200134},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.055863916873931885}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/2025.acl-industry.51","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-industry.51","pdf_url":"https://aclanthology.org/2025.acl-industry.51.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2509.14270","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.14270","pdf_url":"https://arxiv.org/pdf/2509.14270","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.acl-industry.51","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-industry.51","pdf_url":"https://aclanthology.org/2025.acl-industry.51.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6200000047683716,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320307904","display_name":"Oracle","ror":"https://ror.org/006c77m33"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412886987.pdf","grobid_xml":"https://content.openalex.org/works/W4412886987.grobid-xml"},"referenced_works_count":1,"referenced_works":["https://openalex.org/W3095410713"],"related_works":["https://openalex.org/W3037187668","https://openalex.org/W4234772502","https://openalex.org/W2380685755","https://openalex.org/W2252100032","https://openalex.org/W2963436428","https://openalex.org/W4400978025","https://openalex.org/W2918743509","https://openalex.org/W2734796617","https://openalex.org/W4285322112","https://openalex.org/W4292794239"],"abstract_inverted_index":{"High-quality":[0],"Text-to-Speech":[1],"(TTS)":[2],"model":[3],"training":[4,58,123],"requires":[5],"extensive":[6],"and":[7,10,29,73,144,171],"diverse":[8,137],"text":[9,43,61],"speech":[11,90,107,150],"data.It":[12],"is":[13,60,79,112,134],"challenging":[14],"to":[15,23,82],"procure":[16],"such":[17],"data":[18,59,76,108,132,163],"from":[19],"real":[20],"sources":[21],"due":[22],"issues":[24],"of":[25,114,118],"domain":[26],"specificity,":[27],"licensing,":[28],"scalability.Large":[30],"language":[31],"models":[32],"(LLMs)":[33],"can":[34],"certainly":[35],"generate":[36],"textual":[37],"data,":[38],"but":[39],"they":[40],"create":[41],"repetitive":[42],"with":[44,96,148],"insufficient":[45],"variation":[46],"in":[47,56,92,174],"the":[48,51,116,139,175],"prompt":[49],"during":[50],"generation":[52,109,117,164],"process.Another":[53],"important":[54],"aspect":[55],"TTS":[57,94,124,166],"normalization.Tools":[62],"for":[63,87,122,165],"normalization":[64],"might":[65],"occasionally":[66],"introduce":[67],"anomalies":[68],"or":[69],"overlook":[70],"valuable":[71],"patterns,":[72],"thus":[74],"impact":[75],"quality.Furthermore,":[77],"it":[78],"also":[80],"impractical":[81],"rely":[83],"on":[84],"voice":[85,172],"artists":[86],"large":[88],"scale":[89],"recording":[91],"commercial":[93],"systems":[95],"standardized":[97],"voices.To":[98],"address":[99],"these":[100],"challenges,":[101],"we":[102],"propose":[103],"SpeechWeave,":[104],"a":[105],"synthetic":[106],"pipeline":[110,130],"that":[111,128,133],"capable":[113],"automating":[115],"multilingual,":[119],"domainspecific":[120],"datasets":[121],"models.Our":[125],"experiments":[126],"reveal":[127],"our":[129],"generates":[131],"10-48%":[135],"more":[136],"than":[138],"baseline":[140],"across":[141],"various":[142],"linguistic":[143],"phonetic":[145],"metrics,":[146],"along":[147],"speaker-standardized":[149],"audio":[151],"while":[152],"generating":[153],"approximately":[154],"97%":[155],"correctly":[156],"normalized":[157],"text.Our":[158],"approach":[159],"enables":[160],"scalable,":[161],"high-quality":[162],"training,":[167],"improving":[168],"diversity,":[169],"normalization,":[170],"consistency":[173],"generated":[176],"datasets.":[177]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-13T08:25:38.343686","created_date":"2025-08-04T00:00:00"}
