{"id":"https://openalex.org/W3131643166","doi":"https://doi.org/10.1109/taslp.2021.3059114","title":"Perceptual-Similarity-Aware Deep Speaker Representation Learning for Multi-Speaker Generative Modeling","display_name":"Perceptual-Similarity-Aware Deep Speaker Representation Learning for Multi-Speaker Generative Modeling","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3131643166","doi":"https://doi.org/10.1109/taslp.2021.3059114","mag":"3131643166"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2021.3059114","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3059114","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/9289074/09354556.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ieeexplore.ieee.org/ielx7/6570655/9289074/09354556.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083394213","display_name":"Yuki Saito","orcid":"https://orcid.org/0000-0002-7967-2613"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yuki Saito","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013050263","display_name":"Shinnosuke Takamichi","orcid":"https://orcid.org/0000-0003-0520-7847"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinnosuke Takamichi","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003814223","display_name":"Hiroshi Saruwatari","orcid":"https://orcid.org/0000-0003-0876-5617"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroshi Saruwatari","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Bunkyo-ku, Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5083394213"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":1.3996,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":{"value":0.84522586,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"29","issue":null,"first_page":"1033","last_page":"1048"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7046116590499878},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.662555992603302},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6163108944892883},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5975069999694824},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5441119074821472},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5316107273101807},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5258767604827881},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5171553492546082},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4905811846256256},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4107913672924042},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.3022042512893677}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7046116590499878},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.662555992603302},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6163108944892883},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5975069999694824},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5441119074821472},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5316107273101807},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5258767604827881},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5171553492546082},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4905811846256256},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4107913672924042},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3022042512893677},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2021.3059114","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3059114","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/9289074/09354556.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/taslp.2021.3059114","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3059114","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/9289074/09354556.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.7300000190734863,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[{"id":"https://openalex.org/G1069223013","display_name":null,"funder_award_id":"JSPS KAKENHI","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G1688345242","display_name":null,"funder_award_id":"17H06101","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G3459562248","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G4636223006","display_name":null,"funder_award_id":"JSPS KAK","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G5522210649","display_name":"\u805e\u304d\u624b\u30e2\u30c7\u30eb\u306b\u57fa\u3065\u304f\u80fd\u52d5\u7684\u97f3\u58f0\u5408\u6210\u306b\u95a2\u3059\u308b\u7814\u7a76","funder_award_id":"18J22090","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8072092738","display_name":null,"funder_award_id":"18K18100","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8935930414","display_name":null,"funder_award_id":"19H01116","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"}],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3131643166.pdf","grobid_xml":"https://content.openalex.org/works/W3131643166.grobid-xml"},"referenced_works_count":86,"referenced_works":["https://openalex.org/W11364309","https://openalex.org/W1523372075","https://openalex.org/W1959608418","https://openalex.org/W1975879668","https://openalex.org/W2005522781","https://openalex.org/W2033516035","https://openalex.org/W2046056978","https://openalex.org/W2049686551","https://openalex.org/W2062930223","https://openalex.org/W2101491865","https://openalex.org/W2115040572","https://openalex.org/W2120605154","https://openalex.org/W2121750345","https://openalex.org/W2129142580","https://openalex.org/W2146502635","https://openalex.org/W2150769028","https://openalex.org/W2153914468","https://openalex.org/W2154278880","https://openalex.org/W2154920538","https://openalex.org/W2156142001","https://openalex.org/W2156387975","https://openalex.org/W2157825442","https://openalex.org/W2221409856","https://openalex.org/W2395578248","https://openalex.org/W2518172956","https://openalex.org/W2519091744","https://openalex.org/W2567070169","https://openalex.org/W2605320104","https://openalex.org/W2606602301","https://openalex.org/W2608207374","https://openalex.org/W2612872092","https://openalex.org/W2746498480","https://openalex.org/W2749651610","https://openalex.org/W2759925408","https://openalex.org/W2766005220","https://openalex.org/W2787685498","https://openalex.org/W2788357188","https://openalex.org/W2793479148","https://openalex.org/W2804998325","https://openalex.org/W2808149027","https://openalex.org/W2808706139","https://openalex.org/W2890964092","https://openalex.org/W2903158431","https://openalex.org/W2907492528","https://openalex.org/W2940200615","https://openalex.org/W2946200149","https://openalex.org/W2948211236","https://openalex.org/W2949382160","https://openalex.org/W2962691331","https://openalex.org/W2963192573","https://openalex.org/W2963432880","https://openalex.org/W2963470929","https://openalex.org/W2963609956","https://openalex.org/W2963971656","https://openalex.org/W2964168155","https://openalex.org/W2970730223","https://openalex.org/W2972598731","https://openalex.org/W2972848589","https://openalex.org/W2998249245","https://openalex.org/W3015070858","https://openalex.org/W3015338123","https://openalex.org/W3015687355","https://openalex.org/W3015826515","https://openalex.org/W3023257870","https://openalex.org/W3033411150","https://openalex.org/W3096894480","https://openalex.org/W3097152652","https://openalex.org/W3124688585","https://openalex.org/W3130016944","https://openalex.org/W4210257598","https://openalex.org/W4293693810","https://openalex.org/W6600453306","https://openalex.org/W6631309588","https://openalex.org/W6640963894","https://openalex.org/W6681435938","https://openalex.org/W6682889407","https://openalex.org/W6711777497","https://openalex.org/W6731370813","https://openalex.org/W6736204136","https://openalex.org/W6748588790","https://openalex.org/W6752888775","https://openalex.org/W6756615331","https://openalex.org/W6763100362","https://openalex.org/W6763832098","https://openalex.org/W6772230580","https://openalex.org/W6778823374"],"related_works":["https://openalex.org/W4287995534","https://openalex.org/W2998168123","https://openalex.org/W2061273563","https://openalex.org/W2905846897","https://openalex.org/W2970216048","https://openalex.org/W2937349443","https://openalex.org/W2507989420","https://openalex.org/W2963049565","https://openalex.org/W3173553047","https://openalex.org/W3006036127"],"abstract_inverted_index":{"We":[0,76],"propose":[1,77,126],"novel":[2],"deep":[3,29,38],"speaker":[4,25,30,34,57,86,101,105,137,166,183,244],"representation":[5,31,35,79,179],"learning":[6,32,36,80,129,180,214],"that":[7,82,131,175],"considers":[8],"perceptual":[9,85,92,134,188],"similarity":[10,87,168,233,245],"among":[11,230],"speakers":[12],"for":[13,60,246],"multi-speaker":[14,45],"generative":[15,46,62],"modeling.":[16,47],"Following":[17],"its":[18],"success":[19],"in":[20,67,198,256],"accurate":[21,142],"discriminative":[22,51,208],"modeling":[23],"of":[24,28,73,94,111,116,147,163,224],"individuality,":[26],"knowledge":[27],"(i.e.,":[33],"using":[37],"neural":[39],"networks)":[40],"has":[41],"been":[42],"introduced":[43],"to":[44,103,156],"However,":[48],"the":[49,112,118,133,151,161,164,177,192,211,231,239,242,250,253,257],"conventional":[50,204],"algorithm":[52,130,152,215],"does":[53],"not":[54],"necessarily":[55],"learn":[56,104,182],"embeddings":[58,106,143,184,193],"suitable":[59],"such":[61],"modeling,":[63,209],"which":[64],"may":[65],"result":[66],"lower":[68],"quality":[69,197,220],"and":[70,121,136,149,226,228,249],"less":[71],"controllability":[72],"synthetic":[74,195,218,247,258],"speech.":[75],"three":[78,108],"algorithms":[81,98,181],"utilize":[83],"a":[84,100,114,122],"matrix":[88],"obtained":[89],"by":[90,207],"large-scale":[91],"scoring":[93,135,148,225],"speaker-pair":[95,189],"similarity.":[96],"The":[97],"train":[99],"encoder":[102,138],"with":[107,187],"different":[109],"representations":[110],"matrix:":[113],"set":[115],"vectors,":[117],"Gram":[119],"matrix,":[120,235],"graph.":[123],"Furthermore,":[124],"we":[125],"an":[127],"active":[128,213],"iterates":[132],"training.":[139],"To":[140],"obtain":[141],"while":[144,221],"reducing":[145,222],"costs":[146,223],"training,":[150,227],"selects":[153],"unscored":[154],"speaker-pairs":[155],"be":[157],"scored":[158],"next":[159],"on":[160],"basis":[162],"sequentially-trained":[165],"encoder's":[167],"prediction":[169],"results.":[170],"Experimental":[171],"evaluation":[172],"results":[173],"show":[174],"1)":[176],"proposed":[178,212,232],"strongly":[185],"correlated":[186],"similarity,":[190],"2)":[191],"improve":[194],"speech":[196,199,219,248,259],"autoencoding":[200],"tasks":[201],"better":[202],"than":[203],"d-vectors":[205],"learned":[206],"3)":[210],"achieves":[216,241],"higher":[217],"4)":[229],"{vector,":[234],"graph}":[236],"embedding":[237],"algorithms,":[238],"first":[240],"best":[243],"third":[251],"gives":[252],"most":[254],"improvement":[255],"naturalness.":[260]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":2}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
