{"id":"https://openalex.org/W4210544855","doi":"https://doi.org/10.1109/asru51503.2021.9687876","title":"Towards Using Heterogeneous Relation Graphs for End-to-End TTS","display_name":"Towards Using Heterogeneous Relation Graphs for End-to-End TTS","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W4210544855","doi":"https://doi.org/10.1109/asru51503.2021.9687876"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9687876","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687876","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062151201","display_name":"Amrith Setlur","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Amrith Setlur","raw_affiliation_strings":["Carnegie Mellon University,Machine Learning Department"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,Machine Learning Department","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085667632","display_name":"Aman Madaan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aman Madaan","raw_affiliation_strings":["Language Technologies Institute"],"affiliations":[{"raw_affiliation_string":"Language Technologies Institute","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067393636","display_name":"Tanmay Parekh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tanmay Parekh","raw_affiliation_strings":["Language Technologies Institute"],"affiliations":[{"raw_affiliation_string":"Language Technologies Institute","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041434096","display_name":"Yiming Yang","orcid":"https://orcid.org/0000-0003-1359-0364"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yiming Yang","raw_affiliation_strings":["Language Technologies Institute"],"affiliations":[{"raw_affiliation_string":"Language Technologies Institute","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5107337645","display_name":"Alan W. Black","orcid":"https://orcid.org/0000-0001-8820-8831"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alan W Black","raw_affiliation_strings":["Language Technologies Institute"],"affiliations":[{"raw_affiliation_string":"Language Technologies Institute","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5062151201"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.21845205,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"56","issue":null,"first_page":"1162","last_page":"1169"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7454861402511597},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.6217535138130188},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.5814251899719238},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5456713438034058},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.545005738735199},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4760836660861969},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.42671629786491394},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.42028796672821045},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.4197336733341217},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09057098627090454}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7454861402511597},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.6217535138130188},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.5814251899719238},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5456713438034058},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.545005738735199},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4760836660861969},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.42671629786491394},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42028796672821045},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.4197336733341217},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09057098627090454},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9687876","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9687876","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W58567859","https://openalex.org/W70888257","https://openalex.org/W202879582","https://openalex.org/W1522301498","https://openalex.org/W1595709623","https://openalex.org/W1902237438","https://openalex.org/W1970354675","https://openalex.org/W2064675550","https://openalex.org/W2120847449","https://openalex.org/W2129142580","https://openalex.org/W2130750986","https://openalex.org/W2133564696","https://openalex.org/W2150658333","https://openalex.org/W2519091744","https://openalex.org/W2745387029","https://openalex.org/W2752796333","https://openalex.org/W2903739847","https://openalex.org/W2963300588","https://openalex.org/W2963609956","https://openalex.org/W2963799213","https://openalex.org/W2964015378","https://openalex.org/W2964168688","https://openalex.org/W2964243274","https://openalex.org/W3015621018","https://openalex.org/W3095505419","https://openalex.org/W3096684845","https://openalex.org/W3100270690","https://openalex.org/W3112336664","https://openalex.org/W3162271107","https://openalex.org/W6608197479","https://openalex.org/W6631190155","https://openalex.org/W6635459924","https://openalex.org/W6679434410","https://openalex.org/W6726873649","https://openalex.org/W6736996214","https://openalex.org/W6787396906"],"related_works":["https://openalex.org/W3179968364","https://openalex.org/W1999612375","https://openalex.org/W4293226380","https://openalex.org/W2938107654","https://openalex.org/W4390516098","https://openalex.org/W2151749779","https://openalex.org/W3008587939","https://openalex.org/W2181948922","https://openalex.org/W2384362569","https://openalex.org/W2142795561"],"abstract_inverted_index":{"Neural":[0],"models":[1,22,38,110],"for":[2],"end-to-end":[3],"text-to-speech":[4,120],"(TTS)":[5],"synthe-sis":[6],"are":[7],"increasingly":[8],"outperforming":[9],"traditional":[10],"approaches":[11],"in":[12,19,65,126,132],"statistical":[13,36],"parametric":[14,37],"speech":[15],"synthesis.":[16],"Speech":[17],"generation":[18],"these":[20],"neural":[21,62,108],"predominantly":[23],"relies":[24],"on":[25,41,138],"using":[26],"free-form":[27],"text":[28],"as":[29,128],"the":[30,34,52,66,96,104,123,140,146,156],"input":[31],"modality.":[32],"However,":[33],"earlier":[35],"were":[39],"built":[40],"encoded":[42],"phonetic":[43],"and":[44,80,135,150,169],"syn-tactic":[45],"features.":[46],"In":[47],"this":[48],"work,":[49],"we":[50,84],"explore":[51],"possibility":[53],"of":[54,68,77,95,106,172],"explicitly":[55],"feeding":[56],"deterministic":[57],"linguistic":[58],"structure":[59],"to":[60,89,103],"a":[61],"TTS":[63,109],"system":[64],"form":[67],"Heterogeneous":[69],"Relational":[70],"Graphs":[71],"(HRGs),":[72],"an":[73],"expressive":[74],"formalism":[75],"capable":[76],"representing":[78],"pho-netic":[79],"syntactic":[81,124],"information.":[82],"Specifically,":[83],"use":[85],"Graph":[86],"Convolutional":[87],"Networks":[88],"learn":[90],"structurally":[91],"informed":[92],"contin-uous":[93],"representations":[94],"HRGs,":[97],"which":[98],"can":[99],"be":[100,175],"seamlessly":[101],"passed":[102],"encoders":[105],"popular":[107],"like":[111],"TransformerTTS":[112],"or":[113],"Tacotron.":[114],"Furthermore,":[115],"our":[116,170],"simple":[117],"HRG":[118],"based":[119],"synthesis":[121],"leverages":[122],"bias":[125],"HRGs":[127,173],"demonstrated":[129],"by":[130],"improvements":[131],"automated":[133],"met-rics":[134],"human":[136],"evaluation":[137],"i)":[139],"single":[141],"speaker":[142],"dataset":[143,148,171],"LJSpeech;":[144],"ii)":[145],"multi-speaker":[147],"Arctic;":[149],"iii)":[151],"out-of-domain":[152],"test":[153],"sets":[154],"from":[155],"Blizzard":[157],"challenge.":[158],"<sup":[159,162],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[160,163],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[161,164],"The":[165],"code,":[166],"trained":[167],"models,":[168],"will":[174],"released":[176],"at":[177],"https://github.com/ars22/GraphNeuralTTS/.":[178]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
