{"id":"https://openalex.org/W7138981111","doi":"https://doi.org/10.48550/arxiv.2603.16280","title":"CAST-TTS: A Simple Cross-Attention Framework for Unified Timbre Control in TTS","display_name":"CAST-TTS: A Simple Cross-Attention Framework for Unified Timbre Control in TTS","publication_year":2026,"publication_date":"2026-03-17","ids":{"openalex":"https://openalex.org/W7138981111","doi":"https://doi.org/10.48550/arxiv.2603.16280"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.16280","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.16280","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129943196","display_name":"Zihao Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zheng, Zihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130061014","display_name":"Wen Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130043297","display_name":"Chao Ying Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130164447","display_name":"Mengyue Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Mengyue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129959020","display_name":"Xuenan Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xuenan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5129943196"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.4016000032424927,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.4016000032424927,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.17470000684261322,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.12359999865293503,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.8906000256538391},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.6588000059127808},{"id":"https://openalex.org/keywords/unified-model","display_name":"Unified Model","score":0.5551999807357788},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.526199996471405},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5256999731063843},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.3747999966144562}],"concepts":[{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.8906000256538391},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8001000285148621},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.6588000059127808},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.5551999807357788},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5519999861717224},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.526199996471405},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5256999731063843},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4187000095844269},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3747999966144562},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31450000405311584},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.27900001406669617},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2648000121116638}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.16280","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.16280","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.16280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"Text-to-Speech":[1],"(TTS)":[2],"systems":[3],"typically":[4],"use":[5,93],"separate":[6],"models":[7,123],"for":[8,51,112],"speech-prompted":[9],"and":[10,36,61,75],"text-prompted":[11],"timbre":[12,53],"control.":[13,54],"While":[14],"unifying":[15],"both":[16],"control":[17,99],"signals":[18],"into":[19],"a":[20,46,80,127],"single":[21,85],"model":[22,91],"is":[23,110],"desirable,":[24],"the":[25,73,90,100,106],"challenge":[26],"of":[27,95],"cross-modal":[28],"alignment":[29],"often":[30],"results":[31],"in":[32],"overly":[33],"complex":[34],"architectures":[35],"training":[37,69],"objective.":[38],"To":[39],"address":[40],"this":[41],"challenge,":[42],"we":[43],"propose":[44],"CAST-TTS,":[45],"simple":[47],"yet":[48],"effective":[49],"framework":[50],"unified":[52,107,128],"Features":[55],"are":[56],"extracted":[57],"from":[58],"speech":[59,74],"prompts":[60,63],"text":[62,77],"using":[64],"pre-trained":[65],"encoders.":[66],"The":[67,130],"multi-stage":[68],"strategy":[70],"efficiently":[71],"aligns":[72],"projected":[76],"representations":[78,97],"within":[79,126],"shared":[81],"embedding":[82],"space.":[83],"A":[84],"cross-attention":[86,108],"mechanism":[87,109],"then":[88],"allows":[89],"to":[92,98,120],"either":[94],"these":[96],"timbre.":[101],"Extensive":[102],"experiments":[103],"validate":[104],"that":[105],"critical":[111],"achieving":[113],"high-quality":[114],"synthesis.":[115],"CAST-TTS":[116],"achieves":[117],"performance":[118],"comparable":[119],"specialized":[121],"single-input":[122],"while":[124],"operating":[125],"architecture.":[129],"demo":[131],"page":[132],"can":[133],"be":[134],"accessed":[135],"at":[136],"https://HiRookie9.github.io/CAST-TTS-Page.":[137]},"counts_by_year":[],"updated_date":"2026-03-20T20:54:20.808490","created_date":"2026-03-20T00:00:00"}
