{"id":"https://openalex.org/W2972598731","doi":"https://doi.org/10.21437/ssw.2019-10","title":"DNN-based Speaker Embedding Using Subjective Inter-speaker Similarity for Multi-speaker Modeling in Speech Synthesis","display_name":"DNN-based Speaker Embedding Using Subjective Inter-speaker Similarity for Multi-speaker Modeling in Speech Synthesis","publication_year":2019,"publication_date":"2019-09-14","ids":{"openalex":"https://openalex.org/W2972598731","doi":"https://doi.org/10.21437/ssw.2019-10","mag":"2972598731"},"language":"en","primary_location":{"id":"doi:10.21437/ssw.2019-10","is_oa":false,"landing_page_url":"https://doi.org/10.21437/ssw.2019-10","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"10th ISCA Workshop on Speech Synthesis (SSW 10)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083394213","display_name":"Yuki Saito","orcid":"https://orcid.org/0000-0002-7967-2613"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuki Saito","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013050263","display_name":"Shinnosuke Takamichi","orcid":"https://orcid.org/0000-0003-0520-7847"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shinnosuke Takamichi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5003814223","display_name":"Hiroshi Saruwatari","orcid":"https://orcid.org/0000-0003-0876-5617"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hiroshi Saruwatari","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5083394213"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.8401,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.80497955,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"51","last_page":"56"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.988099992275238,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9786999821662903,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6778864860534668},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.674892008304596},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.672481894493103},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.6458278894424438},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.645728588104248},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.5890389680862427},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4968724548816681},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4714955687522888},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.458998441696167},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4527454972267151}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6778864860534668},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.674892008304596},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.672481894493103},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.6458278894424438},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.645728588104248},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.5890389680862427},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4968724548816681},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4714955687522888},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.458998441696167},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4527454972267151},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/ssw.2019-10","is_oa":false,"landing_page_url":"https://doi.org/10.21437/ssw.2019-10","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"10th ISCA Workshop on Speech Synthesis (SSW 10)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/5","score":0.4099999964237213,"display_name":"Gender equality"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W1491159402","https://openalex.org/W4297807400","https://openalex.org/W4313854686","https://openalex.org/W4389984014","https://openalex.org/W2144208207","https://openalex.org/W1509309911","https://openalex.org/W1599425004","https://openalex.org/W2118860825","https://openalex.org/W2096510939"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"novel":[3],"algorithms":[4,69,161],"for":[5,52,70,199],"speaker":[6,21,50,72,105,163,178],"embedding":[7,22,73,91,114,164,179,194],"using":[8,75],"subjective":[9,42,83,171],"inter-speaker":[10,43,77,137,146],"similarity":[11,44,78,89,102,112,127,138,147,192],"based":[12,87,110],"on":[13,88,111],"deep":[14],"neural":[15],"networks":[16],"(DNNs).":[17],"Although":[18],"conventional":[19],"DNN-based":[20,71,184],"such":[23],"as":[24,104],"a":[25,98],"$d$-vector":[26],"can":[27],"be":[28],"applied":[29],"to":[30,96,119,180],"multi-speaker":[31,181],"modeling":[32,182],"in":[33,61,183],"speech":[34,56,185,197,203],"synthesis,":[35],"it":[36],"does":[37],"not":[38,47,59],"correlate":[39],"with":[40,169],"the":[41,62,94,101,117,121,126,130,136,141,145,155,170,176,190,208],"and":[45,92,115,129,154,187],"is":[46,86,109,166],"necessarily":[48],"appropriate":[49],"representation":[51],"open":[53,200],"speakers":[54,201],"whose":[55,202],"utterances":[57,204],"are":[58,205],"included":[60],"training":[63,68],"data.":[64],"We":[65,143,173],"propose":[66],"two":[67],"model":[74,95,118],"an":[76],"matrix":[79,103,113,128,132],"obtained":[80],"by":[81],"large-scale":[82],"scoring.":[84],"One":[85],"vector":[90,99,193],"trains":[93,116],"predict":[97],"of":[100,133,149],"representation.":[106],"The":[107],"other":[108],"minimize":[120],"squared":[122],"Frobenius":[123],"norm":[124],"between":[125],"Gram":[131],"$d$-vectors,":[134],"i.e.,":[135],"derived":[139],"from":[140],"$d$-vectors.":[142],"crowdsourced":[144],"scores":[148],"153":[150],"Japanese":[151],"female":[152],"speakers,":[153],"experimental":[156],"results":[157],"demonstrate":[158],"that":[159,165,189],"our":[160],"learn":[162],"highly":[167],"correlated":[168],"similarity.":[172],"also":[174],"apply":[175],"proposed":[177,191],"synthesis":[186],"reveal":[188],"improves":[195],"synthetic":[196],"quality":[198],"unseen":[206],"during":[207],"training.":[209]},"counts_by_year":[{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
