{"id":"https://openalex.org/W4372183465","doi":"https://doi.org/10.1109/icassp49357.2023.10097113","title":"MID-Attribute Speaker Generation Using Optimal-Transport-Based Interpolation of Gaussian Mixture Models","display_name":"MID-Attribute Speaker Generation Using Optimal-Transport-Based Interpolation of Gaussian Mixture Models","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372183465","doi":"https://doi.org/10.1109/icassp49357.2023.10097113"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10097113","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp49357.2023.10097113","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008391815","display_name":"Aya Watanabe","orcid":"https://orcid.org/0000-0003-3123-489X"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Aya Watanabe","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013050263","display_name":"Shinnosuke Takamichi","orcid":"https://orcid.org/0000-0003-0520-7847"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinnosuke Takamichi","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042731264","display_name":"Yuki Saito","orcid":"https://orcid.org/0000-0002-4825-7783"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yuki Saito","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072398606","display_name":"Detai Xin","orcid":"https://orcid.org/0009-0007-1908-1137"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Detai Xin","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003814223","display_name":"Hiroshi Saruwatari","orcid":"https://orcid.org/0000-0003-0876-5617"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroshi Saruwatari","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5008391815"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":0.2038,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.39805947,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"37","issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9865999817848206,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpolation","display_name":"Interpolation (computer graphics)","score":0.7651628255844116},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.6661732196807861},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6146790981292725},{"id":"https://openalex.org/keywords/gaussian","display_name":"Gaussian","score":0.5553492903709412},{"id":"https://openalex.org/keywords/gaussian-process","display_name":"Gaussian process","score":0.5134207010269165},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4444599747657776},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3462083339691162},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33823102712631226},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.32011187076568604},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.09161233901977539}],"concepts":[{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.7651628255844116},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.6661732196807861},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6146790981292725},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.5553492903709412},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.5134207010269165},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4444599747657776},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3462083339691162},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33823102712631226},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.32011187076568604},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.09161233901977539},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10097113","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp49357.2023.10097113","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.41999998688697815}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1585160083","https://openalex.org/W1968333723","https://openalex.org/W2054723044","https://openalex.org/W2064675550","https://openalex.org/W2071048859","https://openalex.org/W2106751088","https://openalex.org/W2519091744","https://openalex.org/W2527729766","https://openalex.org/W2619368999","https://openalex.org/W2788357188","https://openalex.org/W2808706139","https://openalex.org/W2963181447","https://openalex.org/W2963609956","https://openalex.org/W3015826515","https://openalex.org/W3016159759","https://openalex.org/W3025528898","https://openalex.org/W3033411150","https://openalex.org/W3081800019","https://openalex.org/W3092028330","https://openalex.org/W3095199334","https://openalex.org/W3128910262","https://openalex.org/W3161704465","https://openalex.org/W3178584958","https://openalex.org/W3196584150","https://openalex.org/W3197186969","https://openalex.org/W4224929761","https://openalex.org/W4233762729","https://openalex.org/W4283397032","https://openalex.org/W6676225911","https://openalex.org/W6727697161","https://openalex.org/W6748588790","https://openalex.org/W6752888775","https://openalex.org/W6765987481","https://openalex.org/W6778083308","https://openalex.org/W6778823374","https://openalex.org/W6783867762","https://openalex.org/W6790220310"],"related_works":["https://openalex.org/W1975321310","https://openalex.org/W1952261593","https://openalex.org/W2014494654","https://openalex.org/W2990323019","https://openalex.org/W1516392727","https://openalex.org/W1578916557","https://openalex.org/W1992295166","https://openalex.org/W1964286703","https://openalex.org/W2143508933","https://openalex.org/W2169866437"],"abstract_inverted_index":{"In":[0],"this":[1,55,86],"paper,":[2],"we":[3,88],"propose":[4,89],"a":[5,27,144],"method":[6,38,56,92,111,136],"for":[7],"intermediating":[8],"multiple":[9],"speakers\u2019":[10,141],"attributes":[11,142],"and":[12,112,119,127],"diversifying":[13],"their":[14],"voice":[15],"characteristics":[16],"in":[17],"\u201cspeaker":[18],"generation,\u201d":[19],"an":[20,80,90],"emerging":[21],"task":[22],"that":[23,93,134],"aims":[24],"to":[25,98],"synthesize":[26],"nonexistent":[28,100],"speaker\u2019s":[29],"naturally":[30],"sounding":[31],"voice.":[32],"The":[33,130],"conventional":[34],"TacoSpawn-based":[35],"speaker":[36,43,52,124],"generation":[37],"represents":[39],"the":[40,58,64,73,95,114,120,139],"distributions":[41,75],"of":[42,60,116,122,152],"embeddings":[44],"by":[45,143],"Gaussian":[46],"mixture":[47],"models":[48],"(GMMs)":[49],"conditioned":[50],"with":[51,79,102],"attributes.":[53],"Although":[54],"enables":[57],"sampling":[59],"various":[61],"speakers":[62,78,101],"from":[63],"speaker-attribute-aware":[65],"GMMs,":[66],"it":[67],"is":[68],"not":[69],"yet":[70],"clear":[71],"whether":[72],"learned":[74,96],"can":[76,137],"represent":[77],"intermediate":[81],"attribute":[82],"(i.e.,":[83],"mid-attribute).":[84],"To":[85],"end,":[87],"optimal-transport-based":[91],"interpolates":[94],"GMMs":[97],"generate":[99],"mid-attribute":[103],"(e.g.,":[104],"gender-neutral)":[105],"voices.":[106],"We":[107],"empirically":[108],"validate":[109],"our":[110,135],"evaluate":[113],"naturalness":[115],"synthetic":[117],"speech":[118,153],"controllability":[121],"two":[123],"attributes:":[125],"gender":[126],"language":[128],"fluency.":[129],"evaluation":[131],"results":[132],"show":[133],"control":[138],"generated":[140],"continuous":[145],"scalar":[146],"value":[147],"without":[148],"statistically":[149],"significant":[150],"degradation":[151],"naturalness.":[154]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-12-24T23:09:58.560324","created_date":"2025-10-10T00:00:00"}
