{"id":"https://openalex.org/W7162632726","doi":"https://doi.org/10.48550/arxiv.2605.27376","title":"Unlocking Fine-Grained and Within-Utterance Speaking Style Control in Prompt-Based Text-to-Speech Models","display_name":"Unlocking Fine-Grained and Within-Utterance Speaking Style Control in Prompt-Based Text-to-Speech Models","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7162632726","doi":"https://doi.org/10.48550/arxiv.2605.27376"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27376","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27376","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27376","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137267241","display_name":"Jaehoon Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Jaehoon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137244435","display_name":"Yejin Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Yejin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128379284","display_name":"Yoonji Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Yoonji","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137264106","display_name":"Kyuhong Shim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shim, Kyuhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.7904999852180481,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.7904999852180481,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.05849999934434891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.03629999980330467,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.6499000191688538},{"id":"https://openalex.org/keywords/interpolation","display_name":"Interpolation (computer graphics)","score":0.5095999836921692},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4828999936580658},{"id":"https://openalex.org/keywords/smoothness","display_name":"Smoothness","score":0.47760000824928284},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.46299999952316284},{"id":"https://openalex.org/keywords/realization","display_name":"Realization (probability)","score":0.43880000710487366},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4350000023841858},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4138000011444092}],"concepts":[{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.6499000191688538},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5924999713897705},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5547999739646912},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.5095999836921692},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4828999936580658},{"id":"https://openalex.org/C102634674","wikidata":"https://www.wikidata.org/wiki/Q868473","display_name":"Smoothness","level":2,"score":0.47760000824928284},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.46299999952316284},{"id":"https://openalex.org/C2781089630","wikidata":"https://www.wikidata.org/wiki/Q21856745","display_name":"Realization (probability)","level":2,"score":0.43880000710487366},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4350000023841858},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4138000011444092},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4074999988079071},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.39079999923706055},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38179999589920044},{"id":"https://openalex.org/C194232998","wikidata":"https://www.wikidata.org/wiki/Q1606712","display_name":"Transition (genetics)","level":3,"score":0.34929999709129333},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.337799996137619},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.319599986076355},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2939000129699707},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27376","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27376","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27376","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27376","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"score":0.6502318978309631,"display_name":"Gender equality","id":"https://metadata.un.org/sdg/5"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"prompt-based":[1,60],"text-to-speech":[2],"(TTS)":[3],"models":[4],"enable":[5],"natural":[6],"language-driven":[7],"speaking":[8],"style":[9,22,34,41,65,73,87,91],"control,":[10],"they":[11],"often":[12],"provide":[13],"limited":[14],"fine-grained":[15],"control":[16],"and":[17,39,79,124,149,165],"apply":[18],"a":[19,44,96,136,160],"single":[20,45],"global":[21],"across":[23,37],"an":[24],"utterance.":[25,46],"This":[26],"restricts":[27],"practical":[28],"use":[29],"cases":[30],"that":[31,130],"require":[32],"continuous":[33],"attribute":[35],"interpolation":[36,134],"utterances":[38],"time-varying":[40],"transitions":[42,85],"within":[43],"In":[47],"this":[48,118],"paper,":[49],"we":[50,67,93,120],"propose":[51],"novel":[52],"techniques":[53],"to":[54,112,144,151],"achieve":[55],"both":[56],"capabilities":[57],"in":[58,75,103,140],"existing":[59],"TTS":[61,105],"models.":[62],"For":[63,89],"inter-utterance":[64,133],"interpolation,":[66,82],"compute":[68],"direction":[69],"vectors":[70],"between":[71,86],"contrastive":[72],"prompts":[74],"the":[76,108],"embedding":[77],"space":[78],"perform":[80],"simple":[81],"enabling":[83],"smooth":[84],"characteristics.":[88],"intra-utterance":[90,157],"transition,":[92],"first":[94],"identify":[95],"strong":[97],"attention":[98,126],"bias":[99],"toward":[100],"early":[101],"tokens":[102],"autoregressive":[104],"decoders,":[106],"causing":[107],"initial":[109],"audio":[110],"realization":[111],"dominate":[113],"subsequent":[114],"generation.":[115],"To":[116],"mitigate":[117],"effect,":[119],"introduce":[121],"KV-cache":[122],"swapping":[123],"sliding-window":[125],"masking.":[127],"Experiments":[128],"demonstrate":[129],"our":[131],"proposed":[132],"achieves":[135,166],"99-100%":[137],"success":[138],"rate":[139],"gender":[141],"conversion,":[142],"up":[143,150],"36":[145],"Hz":[146],"pitch":[147],"variation,":[148],"1.6":[152],"syllables-per-second":[153],"speed":[154],"change.":[155],"Our":[156],"transition":[158],"maintains":[159],"speaker":[161],"similarity":[162],"of":[163,170],"0.81-0.91":[164],"perceptual":[167],"smoothness":[168],"scores":[169],"3.48-4.48.":[171]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-29T00:00:00"}
