{"id":"https://openalex.org/W4295036296","doi":"https://doi.org/10.1109/taslp.2022.3202126","title":"ParaTTS: Learning Linguistic and Prosodic Cross-Sentence Information in Paragraph-Based TTS","display_name":"ParaTTS: Learning Linguistic and Prosodic Cross-Sentence Information in Paragraph-Based TTS","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4295036296","doi":"https://doi.org/10.1109/taslp.2022.3202126"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2022.3202126","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2022.3202126","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5009337933","display_name":"Liumeng Xue","orcid":"https://orcid.org/0000-0003-2815-8494"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Liumeng Xue","raw_affiliation_strings":["Audio, Speech and Langauge Processing Group (ASLP@NWPU) and School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Langauge Processing Group (ASLP@NWPU) and School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065394791","display_name":"Frank K. Soong","orcid":"https://orcid.org/0000-0002-9088-3577"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Frank K. Soong","raw_affiliation_strings":["Microsoft Research Asia (MSRA), Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia (MSRA), Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084317268","display_name":"Shaofei Zhang","orcid":"https://orcid.org/0000-0002-8377-2270"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaofei Zhang","raw_affiliation_strings":["Microsoft Azure Speech, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft Azure Speech, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Audio, Speech and Langauge Processing Group (ASLP@NWPU) and School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"Audio, Speech and Langauge Processing Group (ASLP@NWPU) and School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5009337933"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":2.1211,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.89050227,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":"30","issue":null,"first_page":"2854","last_page":"2864"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/paragraph","display_name":"Paragraph","score":0.9157353043556213},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.7918161749839783},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7130861878395081},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.6194452047348022},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.508740246295929},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5014405250549316},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46817559003829956},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.46427008509635925},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.44217729568481445}],"concepts":[{"id":"https://openalex.org/C2777206241","wikidata":"https://www.wikidata.org/wiki/Q194431","display_name":"Paragraph","level":2,"score":0.9157353043556213},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.7918161749839783},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7130861878395081},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.6194452047348022},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.508740246295929},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5014405250549316},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46817559003829956},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.46427008509635925},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.44217729568481445},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2022.3202126","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2022.3202126","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7599999904632568}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W75438095","https://openalex.org/W1574447377","https://openalex.org/W1582784770","https://openalex.org/W1585826837","https://openalex.org/W1739762571","https://openalex.org/W1983920823","https://openalex.org/W2000633092","https://openalex.org/W2100873065","https://openalex.org/W2101652647","https://openalex.org/W2105482032","https://openalex.org/W2124449078","https://openalex.org/W2139795704","https://openalex.org/W2150612204","https://openalex.org/W2153252552","https://openalex.org/W2171121512","https://openalex.org/W2189624215","https://openalex.org/W2249384744","https://openalex.org/W2484713420","https://openalex.org/W2531207078","https://openalex.org/W2612376646","https://openalex.org/W2807717950","https://openalex.org/W2903739847","https://openalex.org/W2963091184","https://openalex.org/W2963609956","https://openalex.org/W2964243274","https://openalex.org/W2967220154","https://openalex.org/W2972375191","https://openalex.org/W2972885185","https://openalex.org/W2990883660","https://openalex.org/W2991206245","https://openalex.org/W3015922793","https://openalex.org/W3097538987","https://openalex.org/W3102451458","https://openalex.org/W3113076898","https://openalex.org/W3151309757","https://openalex.org/W3161113899","https://openalex.org/W3161732385","https://openalex.org/W3162746464","https://openalex.org/W3163339651","https://openalex.org/W3195592874","https://openalex.org/W4395957996","https://openalex.org/W4395961138","https://openalex.org/W6604963828","https://openalex.org/W6606855072","https://openalex.org/W6631190155","https://openalex.org/W6634864274","https://openalex.org/W6640212811","https://openalex.org/W6739901393","https://openalex.org/W6749555683","https://openalex.org/W6755207826","https://openalex.org/W6763832098","https://openalex.org/W6767111847","https://openalex.org/W6778823374","https://openalex.org/W6780493881","https://openalex.org/W6783039324","https://openalex.org/W6783867762","https://openalex.org/W6785242122","https://openalex.org/W6865570193"],"related_works":["https://openalex.org/W2377059580","https://openalex.org/W127000293","https://openalex.org/W2391800119","https://openalex.org/W2052919063","https://openalex.org/W2595239241","https://openalex.org/W2799181378","https://openalex.org/W3003711649","https://openalex.org/W2904173691","https://openalex.org/W4365517254","https://openalex.org/W270947280"],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,14,35,48,58,73,96,115,126,139,242,247],"neural":[3],"end-to-end":[4],"text-to-speech":[5],"(TTS)":[6],"models":[7],"have":[8],"shown":[9],"high-quality,":[10],"natural":[11,174],"synthesized":[12],"speech":[13,177,229],"a":[15,30,38,50,87,97,116,127,140,146,151,159],"conventional":[16],"sentence-based":[17,200,240],"TTS.":[18],"However,":[19],"it":[20,170],"is":[21,33,118,129,142,235],"still":[22],"challenging":[23],"to":[24,45,62,91],"reproduce":[25],"similar":[26,211],"high":[27],"quality":[28],"when":[29],"whole":[31],"paragraph":[32,98,117,128,141,204,221],"considered":[34,47],"TTS,":[36],"where":[37],"large":[39],"amount":[40],"of":[41,206,223],"contextual":[42,181],"information":[43,67,94,114,125],"needs":[44],"be":[46,193],"building":[49],"paragraph-based":[51],"TTS":[52,166,228],"model.":[53,201],"To":[54],"alleviate":[55],"the":[56,93,100,103,123,164,199,208,219,224,227,232,239],"difficulty":[57],"training,":[59],"we":[60,107],"propose":[61],"model":[63,167,234,241],"linguistic":[64],"and":[65,80,99,110,122,175,186,196,245],"prosodic":[66,187],"by":[68,120,145,158,231],"considering":[69],"cross-sentence,":[70],"embedded":[71,95],"structure":[72],"training.":[74],"Three":[75],"sub-modules,":[76],"including":[77],"linguistics-aware,":[78],"prosody-aware":[79,111],"sentence-position":[81,147],"networks,":[82],"are":[83,210],"trained":[84],"together":[85],"with":[86,131],"modified":[88],"Tacotron2.":[89],"Specifically,":[90],"learn":[92],"relations":[101],"among":[102],"corresponding":[104],"component":[105],"sentences,":[106,191],"utilize":[108],"linguistics-aware":[109],"networks.":[112],"The":[113,135,179],"captured":[119],"encoders":[121],"inter-sentence":[124],"learned":[130],"multi-head":[132],"attention":[133],"mechanisms.":[134],"relative":[136],"sentence":[137],"position":[138],"explicitly":[143],"exploited":[144],"network.":[148],"Trained":[149],"on":[150,203],"storytelling":[152],"audio-book":[153],"corpus":[154],"(4.08":[155],"hours),":[156],"recorded":[157],"female":[160],"Mandarin":[161],"Chinese":[162],"speaker,":[163],"proposed":[165],"demonstrates":[168],"that":[169],"can":[171,192],"produce":[172],"rather":[173],"good-quality":[176],"paragraph-wise.":[178],"cross-sentence":[180],"information,":[182],"such":[183],"as":[184],"break":[185],"variations":[188],"between":[189],"consecutive":[190],"better":[194],"predicted":[195],"rendered":[197],"than":[198,218],"Tested":[202],"texts,":[205],"which":[207],"lengths":[209],"to,":[212],"longer":[213,217],"than,":[214],"or":[215],"much":[216],"typical":[220],"length":[222],"training":[225],"data,":[226],"produced":[230],"new":[233],"consistently":[236],"preferred":[237],"over":[238],"subjective":[243],"tests":[244],"confirmed":[246],"objective":[248],"measures.":[249]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":7},{"year":2023,"cited_by_count":7}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
