{"id":"https://openalex.org/W3015853838","doi":"https://doi.org/10.1109/icassp40776.2020.9053371","title":"Semi-Supervised Speaker Adaptation for End-to-End Speech Synthesis with Pretrained Models","display_name":"Semi-Supervised Speaker Adaptation for End-to-End Speech Synthesis with Pretrained Models","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W3015853838","doi":"https://doi.org/10.1109/icassp40776.2020.9053371","mag":"3015853838"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9053371","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053371","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082312736","display_name":"Katsuki Inoue","orcid":null},"institutions":[{"id":"https://openalex.org/I163770644","display_name":"Okayama University","ror":"https://ror.org/02pc6pc55","country_code":"JP","type":"education","lineage":["https://openalex.org/I163770644"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Katsuki Inoue","raw_affiliation_strings":["Okayama University"],"affiliations":[{"raw_affiliation_string":"Okayama University","institution_ids":["https://openalex.org/I163770644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070197974","display_name":"Sunao Hara","orcid":"https://orcid.org/0000-0001-5972-5178"},"institutions":[{"id":"https://openalex.org/I163770644","display_name":"Okayama University","ror":"https://ror.org/02pc6pc55","country_code":"JP","type":"education","lineage":["https://openalex.org/I163770644"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Sunao Hara","raw_affiliation_strings":["Okayama University"],"affiliations":[{"raw_affiliation_string":"Okayama University","institution_ids":["https://openalex.org/I163770644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063057571","display_name":"Masanobu Abe","orcid":"https://orcid.org/0000-0001-9472-5077"},"institutions":[{"id":"https://openalex.org/I163770644","display_name":"Okayama University","ror":"https://ror.org/02pc6pc55","country_code":"JP","type":"education","lineage":["https://openalex.org/I163770644"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masanobu Abe","raw_affiliation_strings":["Okayama University"],"affiliations":[{"raw_affiliation_string":"Okayama University","institution_ids":["https://openalex.org/I163770644"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078778981","display_name":"Tomoki Hayashi","orcid":"https://orcid.org/0000-0001-8782-4093"},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tomoki Hayashi","raw_affiliation_strings":["Nagoya University"],"affiliations":[{"raw_affiliation_string":"Nagoya University","institution_ids":["https://openalex.org/I60134161"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100669485","display_name":"Ryuichi Yamamoto","orcid":"https://orcid.org/0000-0003-0299-5470"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ryuichi Yamamoto","raw_affiliation_strings":["LINE Corp"],"affiliations":[{"raw_affiliation_string":"LINE Corp","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Johns Hopkins University"],"affiliations":[{"raw_affiliation_string":"Johns Hopkins University","institution_ids":["https://openalex.org/I145311948"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5082312736"],"corresponding_institution_ids":["https://openalex.org/I163770644"],"apc_list":null,"apc_paid":null,"fwci":1.5907,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.86579818,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"7634","last_page":"7638"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8300865888595581},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7745059728622437},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.758432149887085},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7047565579414368},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.5734866857528687},{"id":"https://openalex.org/keywords/mel-frequency-cepstrum","display_name":"Mel-frequency cepstrum","score":0.47083568572998047},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46316468715667725},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4351573586463928},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4180656969547272},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.2262948453426361}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8300865888595581},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7745059728622437},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.758432149887085},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7047565579414368},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.5734866857528687},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.47083568572998047},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46316468715667725},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4351573586463928},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4180656969547272},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2262948453426361},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp40776.2020.9053371","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053371","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1524333225","https://openalex.org/W2064675550","https://openalex.org/W2102003408","https://openalex.org/W2102113734","https://openalex.org/W2111284386","https://openalex.org/W2133564696","https://openalex.org/W2160815625","https://openalex.org/W2184045248","https://openalex.org/W2519091744","https://openalex.org/W2749651610","https://openalex.org/W2788357188","https://openalex.org/W2808706139","https://openalex.org/W2885185669","https://openalex.org/W2890964092","https://openalex.org/W2892140764","https://openalex.org/W2899771611","https://openalex.org/W2940544976","https://openalex.org/W2946200149","https://openalex.org/W2949382160","https://openalex.org/W2962699523","https://openalex.org/W2962780374","https://openalex.org/W2962826786","https://openalex.org/W2963192573","https://openalex.org/W2963250244","https://openalex.org/W2963403868","https://openalex.org/W2963432880","https://openalex.org/W2963609956","https://openalex.org/W2963691546","https://openalex.org/W2963739817","https://openalex.org/W2963796886","https://openalex.org/W2964243274","https://openalex.org/W2964308564","https://openalex.org/W2970730223","https://openalex.org/W2972359262","https://openalex.org/W2972818416","https://openalex.org/W2972889948","https://openalex.org/W2972970915","https://openalex.org/W2973026522","https://openalex.org/W2973034126","https://openalex.org/W2996414377","https://openalex.org/W3007328579","https://openalex.org/W3016160783","https://openalex.org/W3101689408","https://openalex.org/W4385245566","https://openalex.org/W6629717138","https://openalex.org/W6631362777","https://openalex.org/W6675365184","https://openalex.org/W6679434410","https://openalex.org/W6736996214","https://openalex.org/W6739901393","https://openalex.org/W6748588790","https://openalex.org/W6749489859","https://openalex.org/W6752888775","https://openalex.org/W6754925833","https://openalex.org/W6756040250","https://openalex.org/W6763832098","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W3119288895","https://openalex.org/W2185075503","https://openalex.org/W2186375278","https://openalex.org/W2749720872","https://openalex.org/W2793748347","https://openalex.org/W4229940372","https://openalex.org/W4250904811","https://openalex.org/W2155047054","https://openalex.org/W2104528589","https://openalex.org/W2379120504"],"abstract_inverted_index":{"Recently,":[0],"end-to-end":[1,62,91,117],"text-to-speech":[2],"(TTS)":[3],"models":[4,78],"have":[5],"achieved":[6,128],"a":[7,12,38,110,132],"remarkable":[8],"performance,":[9],"however,":[10],"requiring":[11],"large":[13],"amount":[14],"of":[15,34,48,60,85,139],"paired":[16,133],"text":[17,43,82,100],"and":[18,68,143],"speech":[19,35,64],"data":[20,101,134],"for":[21,37],"training.":[22],"On":[23],"the":[24,52,56,104],"other":[25],"hand,":[26],"we":[27],"can":[28,93,107],"easily":[29],"collect":[30],"unpaired":[31],"dozen":[32],"minutes":[33],"recordings":[36],"target":[39],"speaker":[40,111,141],"without":[41],"corresponding":[42,70],"data.":[44],"To":[45],"make":[46],"use":[47],"such":[49,98],"accessible":[50],"data,":[51],"proposed":[53,105,126],"method":[54,106,127,136],"leverages":[55],"recent":[57],"great":[58],"success":[59],"state-of-the-art":[61],"automatic":[63],"recognition":[65],"(ASR)":[66],"systems":[67],"obtains":[69],"transcriptions":[71],"from":[72],"pretrained":[73],"ASR":[74],"models.":[75],"Although":[76],"these":[77],"could":[79],"only":[80],"provide":[81],"output":[83],"instead":[84],"intermediate":[86],"linguistic":[87],"features":[88],"like":[89],"phonemes,":[90],"TTS":[92],"be":[94],"well":[95],"trained":[96],"with":[97],"raw":[99],"directly.":[102],"Thus,":[103],"greatly":[108],"simplify":[109],"adaptation":[112,135],"pipeline":[113],"by":[114],"consistently":[115],"employing":[116],"ASR/TTS":[118],"ecosystems.":[119],"The":[120],"experimental":[121],"results":[122],"show":[123],"that":[124],"our":[125],"comparable":[129],"performance":[130],"to":[131],"in":[137],"terms":[138],"subjective":[140],"similarity":[142],"objective":[144],"cepstral":[145],"distance":[146],"measures.":[147]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":4},{"year":2021,"cited_by_count":5},{"year":2020,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
