{"id":"https://openalex.org/W4391021635","doi":"https://doi.org/10.1109/asru57964.2023.10389629","title":"HIGNN-TTS: Hierarchical Prosody Modeling With Graph Neural Networks for Expressive Long-Form TTS","display_name":"HIGNN-TTS: Hierarchical Prosody Modeling With Graph Neural Networks for Expressive Long-Form TTS","publication_year":2023,"publication_date":"2023-12-16","ids":{"openalex":"https://openalex.org/W4391021635","doi":"https://doi.org/10.1109/asru57964.2023.10389629"},"language":"en","primary_location":{"id":"doi:10.1109/asru57964.2023.10389629","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru57964.2023.10389629","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101377824","display_name":"Dake Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dake Guo","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101883211","display_name":"Xinfa Zhu","orcid":"https://orcid.org/0000-0001-9275-523X"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinfa Zhu","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009337933","display_name":"Liumeng Xue","orcid":"https://orcid.org/0000-0003-2815-8494"},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liumeng Xue","raw_affiliation_strings":["The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen),School of Data Science,China","School of Data Science, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen),School of Data Science,China","institution_ids":["https://openalex.org/I4210116924"]},{"raw_affiliation_string":"School of Data Science, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100455376","display_name":"Tao Li","orcid":"https://orcid.org/0000-0003-1697-8022"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Li","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108604484","display_name":"Yuanjun Lv","orcid":"https://orcid.org/0009-0002-2272-9153"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanjun Lv","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109786295","display_name":"Yuepeng Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuepeng Jiang","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,Audio, Speech and Language Processing Group (ASLP@NPU), School of Computer Science,Xi&#x2019;an,China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101377824"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":0.6993,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.76964468,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.8827307224273682},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.7195287346839905},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6969671845436096},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6165404319763184},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5720096826553345},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5150551795959473},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48825979232788086},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.44649070501327515},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4428202509880066},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4348594546318054},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.12282395362854004},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.054277628660202026}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.8827307224273682},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.7195287346839905},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6969671845436096},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6165404319763184},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5720096826553345},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5150551795959473},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48825979232788086},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.44649070501327515},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4428202509880066},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4348594546318054},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.12282395362854004},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.054277628660202026},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru57964.2023.10389629","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru57964.2023.10389629","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7599999904632568,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1731081199","https://openalex.org/W2107092366","https://openalex.org/W2254292464","https://openalex.org/W2471520273","https://openalex.org/W2963609956","https://openalex.org/W2964138190","https://openalex.org/W2967220154","https://openalex.org/W2972881244","https://openalex.org/W2973158936","https://openalex.org/W2973217961","https://openalex.org/W3082097605","https://openalex.org/W3113076898","https://openalex.org/W3135644023","https://openalex.org/W3152136404","https://openalex.org/W3163339651","https://openalex.org/W3187966659","https://openalex.org/W4221165968","https://openalex.org/W4226230180","https://openalex.org/W4226421465","https://openalex.org/W4283832640","https://openalex.org/W4285603330","https://openalex.org/W4287212799","https://openalex.org/W4295036296","https://openalex.org/W4372260509","https://openalex.org/W4372266971","https://openalex.org/W4372341043","https://openalex.org/W4372346850","https://openalex.org/W4385329631","https://openalex.org/W6637618735","https://openalex.org/W6691808304","https://openalex.org/W6749555683","https://openalex.org/W6750489868","https://openalex.org/W6755207826","https://openalex.org/W6763701032","https://openalex.org/W6763832098","https://openalex.org/W6778823374","https://openalex.org/W6796464841"],"related_works":["https://openalex.org/W169399214","https://openalex.org/W4391272374","https://openalex.org/W1914543332","https://openalex.org/W1984347656","https://openalex.org/W2946856121","https://openalex.org/W10581632","https://openalex.org/W1927421023","https://openalex.org/W2108985546","https://openalex.org/W2081919107","https://openalex.org/W2433276473"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,59,118],"text-to-speech,":[3],"particularly":[4],"those":[5],"based":[6],"on":[7,95],"Graph":[8],"Neural":[9],"Networks":[10],"(GNNs),":[11],"have":[12],"significantly":[13,130],"improved":[14],"the":[15,40,60,64,77,99,103,114,132],"expressiveness":[16,135],"of":[17,42,66,81,98,116,136],"short-form":[18],"synthetic":[19,138],"speech.":[20],"However,":[21],"generating":[22],"human-parity":[23],"long-form":[24,137],"speech":[25,139],"with":[26,44,106],"high":[27,108],"dynamic":[28,109],"prosodic":[29,104],"variations":[30,105],"is":[31],"still":[32],"challenging.":[33],"To":[34],"address":[35],"this":[36],"problem,":[37],"we":[38,53,88],"expand":[39],"capabilities":[41],"GNNs":[43,82],"a":[45,55,71,107],"hierarchical":[46,90,120],"prosody":[47,78,94],"modeling":[48,79],"approach,":[49],"named":[50],"HiGNNTTS.":[51],"Specifically,":[52],"add":[54],"virtual":[56],"global":[57],"node":[58,97],"graph":[61,100],"to":[62,75,85,101],"strengthen":[63],"interconnection":[65],"word":[67],"nodes":[68],"and":[69,124,134],"introduce":[70],"contextual":[72],"attention":[73],"mechanism":[74],"broaden":[76],"scope":[80],"from":[83,92],"intra-sentence":[84],"inter-sentence.":[86],"Additionally,":[87],"perform":[89],"supervision":[91],"acoustic":[93],"each":[96],"capture":[102],"range.":[110],"Ablation":[111],"studies":[112],"show":[113],"effectiveness":[115],"HiGNN-TTS":[117,129],"learning":[119],"prosody.":[121],"Both":[122],"objective":[123],"subjective":[125],"evaluations":[126],"demonstrate":[127],"that":[128],"improves":[131],"naturalness":[133],"<sup":[140,144],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[141,145],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[142,146],".":[143],"Speech":[147],"samples:":[148],"https://dukguo.github.io/HiGNN-TTS/":[149]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
