{"id":"https://openalex.org/W2981934523","doi":"https://doi.org/10.1109/icassp40776.2020.9054556","title":"Mellotron: Multispeaker Expressive Voice Synthesis by Conditioning on Rhythm, Pitch and Global Style Tokens","display_name":"Mellotron: Multispeaker Expressive Voice Synthesis by Conditioning on Rhythm, Pitch and Global Style Tokens","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W2981934523","doi":"https://doi.org/10.1109/icassp40776.2020.9054556","mag":"2981934523"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9054556","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9054556","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1910.11997","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113576287","display_name":"Rafael Valle","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rafael Valle","raw_affiliation_strings":["NVIDIA Corporation","NVIDIA Corporation#TAB#"],"affiliations":[{"raw_affiliation_string":"NVIDIA Corporation","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA Corporation#TAB#","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100762970","display_name":"Jason Li","orcid":"https://orcid.org/0000-0002-1150-3549"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jason Li","raw_affiliation_strings":["NVIDIA Corporation","NVIDIA Corporation#TAB#"],"affiliations":[{"raw_affiliation_string":"NVIDIA Corporation","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA Corporation#TAB#","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010578786","display_name":"Ryan Prenger","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ryan Prenger","raw_affiliation_strings":["NVIDIA Corporation","NVIDIA Corporation#TAB#"],"affiliations":[{"raw_affiliation_string":"NVIDIA Corporation","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA Corporation#TAB#","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5066242985","display_name":"Bryan Catanzaro","orcid":"https://orcid.org/0000-0003-0034-7728"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bryan Catanzaro","raw_affiliation_strings":["NVIDIA Corporation","NVIDIA Corporation#TAB#"],"affiliations":[{"raw_affiliation_string":"NVIDIA Corporation","institution_ids":["https://openalex.org/I4210127875"]},{"raw_affiliation_string":"NVIDIA Corporation#TAB#","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5113576287"],"corresponding_institution_ids":["https://openalex.org/I4210127875"],"apc_list":null,"apc_paid":null,"fwci":2.21663517,"has_fulltext":true,"cited_by_count":17,"citation_normalized_percentile":{"value":0.88459767,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"6189","last_page":"6193"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.7820435762405396},{"id":"https://openalex.org/keywords/rhythm","display_name":"Rhythm","score":0.7770313024520874},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7555068731307983},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6396591067314148},{"id":"https://openalex.org/keywords/emotive","display_name":"Emotive","score":0.5949445366859436},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.5038382411003113},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.48140573501586914},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.479056715965271},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.3503449559211731},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.23604676127433777},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.17187923192977905},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.12156403064727783},{"id":"https://openalex.org/keywords/art","display_name":"Art","score":0.08666479587554932}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.7820435762405396},{"id":"https://openalex.org/C135343436","wikidata":"https://www.wikidata.org/wiki/Q170406","display_name":"Rhythm","level":2,"score":0.7770313024520874},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7555068731307983},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6396591067314148},{"id":"https://openalex.org/C2776215170","wikidata":"https://www.wikidata.org/wiki/Q5373820","display_name":"Emotive","level":2,"score":0.5949445366859436},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.5038382411003113},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.48140573501586914},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.479056715965271},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.3503449559211731},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.23604676127433777},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.17187923192977905},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.12156403064727783},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.08666479587554932},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1109/icassp40776.2020.9054556","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9054556","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1910.11997","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1910.11997","pdf_url":"https://arxiv.org/pdf/1910.11997","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:2981934523","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/1910.11997.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.1910.11997","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.1910.11997","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"},{"id":"doi:10.17023/2cwt-w520","is_oa":true,"landing_page_url":"https://doi.org/10.17023/2cwt-w520","pdf_url":null,"source":{"id":"https://openalex.org/S7407051697","display_name":"IEEE RESOURCE CENTERS","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1910.11997","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1910.11997","pdf_url":"https://arxiv.org/pdf/1910.11997","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6399999856948853,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2981934523.pdf","grobid_xml":"https://content.openalex.org/works/W2981934523.grobid-xml"},"referenced_works_count":29,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1555851617","https://openalex.org/W1572665971","https://openalex.org/W1976069042","https://openalex.org/W2091425152","https://openalex.org/W2107740512","https://openalex.org/W2130086727","https://openalex.org/W2515336442","https://openalex.org/W2747874407","https://openalex.org/W2792995953","https://openalex.org/W2794490148","https://openalex.org/W2795109282","https://openalex.org/W2932319281","https://openalex.org/W2962824709","https://openalex.org/W2963300588","https://openalex.org/W2963927338","https://openalex.org/W2964243274","https://openalex.org/W2966506747","https://openalex.org/W2972359262","https://openalex.org/W2973046048","https://openalex.org/W2973215447","https://openalex.org/W6631190155","https://openalex.org/W6633268702","https://openalex.org/W6634181976","https://openalex.org/W6675938391","https://openalex.org/W6749489859","https://openalex.org/W6750489868","https://openalex.org/W6765987481","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W3015645837","https://openalex.org/W2964243274","https://openalex.org/W2794490148","https://openalex.org/W3146962208","https://openalex.org/W2994863420","https://openalex.org/W2995670387","https://openalex.org/W2471520273","https://openalex.org/W2935697391","https://openalex.org/W3135864795","https://openalex.org/W3097152652","https://openalex.org/W3200345197","https://openalex.org/W2066925125","https://openalex.org/W150843187","https://openalex.org/W3160706245","https://openalex.org/W3146104336","https://openalex.org/W2112931636","https://openalex.org/W144392981","https://openalex.org/W1945282639","https://openalex.org/W2781826204","https://openalex.org/W2964150074"],"abstract_inverted_index":{"Mellotron":[0,42,77],"is":[1,43],"a":[2,15,49],"multispeaker":[3],"voice":[4,16,68],"synthesis":[5],"model":[6],"based":[7],"on":[8,29],"Tacotron":[9],"2":[10],"GST":[11],"that":[12,107],"can":[13],"make":[14],"emote":[17],"and":[18,31,65,87,96,104,115,125,127],"sing":[19],"without":[20,83],"emotive":[21],"or":[22,39],"singing":[23,70],"training":[24],"data.":[25],"By":[26],"explicitly":[27],"conditioning":[28],"rhythm":[30,124],"continuous":[32],"pitch":[33,126],"contours":[34],"from":[35,54,60,66,111],"an":[36],"audio":[37],"signal":[38],"music":[40],"score,":[41],"able":[44],"to":[45,57,63,69],"generate":[46],"speech":[47,56,81],"in":[48],"variety":[50],"of":[51,123],"styles":[52,116],"ranging":[53],"read":[55,80],"expressive":[58],"speech,":[59],"slow":[61],"drawls":[62],"rap":[64],"monotonous":[67],"voice.":[71],"Unlike":[72],"other":[73,112],"methods,":[74],"we":[75],"train":[76],"using":[78,93],"only":[79],"data":[82],"alignments":[84],"between":[85],"text":[86],"audio.":[88],"We":[89,99],"evaluate":[90],"our":[91],"models":[92],"the":[94],"LJSpeech":[95],"LibriTTS":[97],"datasets.":[98],"provide":[100],"F0":[101],"Frame":[102],"Errors":[103],"synthesized":[105],"samples":[106],"include":[108],"style":[109],"transfer":[110],"speakers,":[113],"singers":[114],"not":[117],"seen":[118],"during":[119],"training,":[120],"procedural":[121],"manipulation":[122],"choir":[128],"synthesis.":[129]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":7},{"year":2019,"cited_by_count":1}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
