{"id":"https://openalex.org/W2972374322","doi":"https://doi.org/10.21437/interspeech.2019-3232","title":"VQVAE Unsupervised Unit Discovery and Multi-Scale Code2Spec Inverter for Zerospeech Challenge 2019","display_name":"VQVAE Unsupervised Unit Discovery and Multi-Scale Code2Spec Inverter for Zerospeech Challenge 2019","publication_year":2019,"publication_date":"2019-09-13","ids":{"openalex":"https://openalex.org/W2972374322","doi":"https://doi.org/10.21437/interspeech.2019-3232","mag":"2972374322"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2019-3232","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2019-3232","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2019","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038296765","display_name":"Andros Tjandra","orcid":"https://orcid.org/0000-0003-1246-5908"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]},{"id":"https://openalex.org/I4210126580","display_name":"RIKEN Center for Advanced Intelligence Project","ror":"https://ror.org/03ckxwf91","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210126580"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Andros Tjandra","raw_affiliation_strings":["RIKEN, Center for Advanced Intelligence Project AIP, Japan,","Nara Institute of Science and Technology, Japan"],"affiliations":[{"raw_affiliation_string":"RIKEN, Center for Advanced Intelligence Project AIP, Japan,","institution_ids":["https://openalex.org/I4210126580"]},{"raw_affiliation_string":"Nara Institute of Science and Technology, Japan","institution_ids":["https://openalex.org/I75917431"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001303929","display_name":"Berrak \u015ei\u015fman","orcid":"https://orcid.org/0000-0001-8078-3305"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Berrak Sisman","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100432806","display_name":"Mingyang Zhang","orcid":"https://orcid.org/0000-0001-6517-2880"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Mingyang Zhang","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040108974","display_name":"Sakriani Sakti","orcid":"https://orcid.org/0000-0001-5509-8963"},"institutions":[{"id":"https://openalex.org/I4210126580","display_name":"RIKEN Center for Advanced Intelligence Project","ror":"https://ror.org/03ckxwf91","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210126580"]},{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Sakriani Sakti","raw_affiliation_strings":["RIKEN, Center for Advanced Intelligence Project AIP, Japan,","Nara Institute of Science and Technology, Japan"],"affiliations":[{"raw_affiliation_string":"RIKEN, Center for Advanced Intelligence Project AIP, Japan,","institution_ids":["https://openalex.org/I4210126580"]},{"raw_affiliation_string":"Nara Institute of Science and Technology, Japan","institution_ids":["https://openalex.org/I75917431"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032690182","display_name":"Haizhou Li","orcid":"https://orcid.org/0000-0001-9158-9401"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Haizhou Li","raw_affiliation_strings":["Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020994673","display_name":"Satoshi Nakamura","orcid":"https://orcid.org/0000-0001-6956-3803"},"institutions":[{"id":"https://openalex.org/I4210126580","display_name":"RIKEN Center for Advanced Intelligence Project","ror":"https://ror.org/03ckxwf91","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210126580"]},{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Satoshi Nakamura","raw_affiliation_strings":["Nara Institute of Science and Technology, Japan","RIKEN, Center for Advanced Intelligence Project AIP, Japan,"],"affiliations":[{"raw_affiliation_string":"Nara Institute of Science and Technology, Japan","institution_ids":["https://openalex.org/I75917431"]},{"raw_affiliation_string":"RIKEN, Center for Advanced Intelligence Project AIP, Japan,","institution_ids":["https://openalex.org/I4210126580"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5038296765"],"corresponding_institution_ids":["https://openalex.org/I4210126580","https://openalex.org/I75917431"],"apc_list":null,"apc_paid":null,"fwci":6.9377,"has_fulltext":false,"cited_by_count":63,"citation_normalized_percentile":{"value":0.97502377,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1118","last_page":"1122"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codebook","display_name":"Codebook","score":0.914882242679596},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7302719950675964},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6932677626609802},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.6843138933181763},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.5596473217010498},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.5272873044013977},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.482060045003891},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.47126245498657227},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.46821215748786926},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46503016352653503},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.45714855194091797},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4392443895339966},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.4346424341201782},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.17689797282218933}],"concepts":[{"id":"https://openalex.org/C127759330","wikidata":"https://www.wikidata.org/wiki/Q637416","display_name":"Codebook","level":2,"score":0.914882242679596},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7302719950675964},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6932677626609802},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.6843138933181763},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.5596473217010498},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.5272873044013977},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.482060045003891},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.47126245498657227},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.46821215748786926},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46503016352653503},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.45714855194091797},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4392443895339966},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.4346424341201782},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.17689797282218933},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2019-3232","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2019-3232","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2019","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.5600000023841858,"id":"https://metadata.un.org/sdg/10"}],"awards":[{"id":"https://openalex.org/G1069223013","display_name":null,"funder_award_id":"JSPS KAKENHI","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G1688345242","display_name":null,"funder_award_id":"17H06101","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G3459562248","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G4636223006","display_name":null,"funder_award_id":"JSPS KAK","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G4827429566","display_name":null,"funder_award_id":"Grant Numbers","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G6616750388","display_name":null,"funder_award_id":"JP17H06101","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8430481527","display_name":null,"funder_award_id":"Number","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8777250574","display_name":"Research for unsupervised acoustic pattern discovery with zero resources","funder_award_id":"17K00237","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G947068629","display_name":null,"funder_award_id":"JP17K00237","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"}],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1836465849","https://openalex.org/W1959608418","https://openalex.org/W2101234009","https://openalex.org/W2120847449","https://openalex.org/W2191779130","https://openalex.org/W2399576818","https://openalex.org/W2519091744","https://openalex.org/W2547039119","https://openalex.org/W2593414223","https://openalex.org/W2666408839","https://openalex.org/W2786608204","https://openalex.org/W2787447541","https://openalex.org/W2888858245","https://openalex.org/W2899771611","https://openalex.org/W2911340057","https://openalex.org/W2940544976","https://openalex.org/W2962699523","https://openalex.org/W2962896155","https://openalex.org/W2963796886","https://openalex.org/W2963799213","https://openalex.org/W2963971656","https://openalex.org/W2964069186","https://openalex.org/W3125709657","https://openalex.org/W4300047444","https://openalex.org/W4320013936","https://openalex.org/W4394670483"],"related_works":["https://openalex.org/W2148772884","https://openalex.org/W2017514583","https://openalex.org/W2100120615","https://openalex.org/W2147406819","https://openalex.org/W2352648934","https://openalex.org/W1929869830","https://openalex.org/W2387054321","https://openalex.org/W2017401491","https://openalex.org/W2062765737","https://openalex.org/W2377001183"],"abstract_inverted_index":{"We":[0],"describe":[1],"our":[2,141],"submitted":[3],"system":[4,30,52],"for":[5],"the":[6,14,44,51,56,60,65,68,71,79,106,118,125,132,136,158,171,175,183,190],"ZeroSpeech":[7,185],"Challenge":[8],"2019.The":[9],"current":[10],"challenge":[11],"theme":[12],"addresses":[13],"difficulty":[15],"of":[16,70],"constructing":[17],"a":[18,29,47,84,91,109,128],"speech":[19,45,107],"synthesizer":[20],"without":[21],"any":[22],"text":[23],"or":[24,188],"phonetic":[25],"labels":[26],"and":[27,41,64,67,77,90,101,121,152,154,164,177],"requires":[28],"that":[31],"can":[32],"(1)":[33],"discover":[34],"subword":[35],"units":[36],"in":[37],"an":[38],"unsupervised":[39],"way,":[40],"(2)":[42],"synthesize":[43],"with":[46,157],"target":[48,133],"speaker's":[49],"voice.Moreover,":[50],"should":[53],"also":[54,144],"balance":[55],"discrimination":[57,178],"score":[58],"ABX,":[59],"bit-rate":[61],"compression":[62],"rate,":[63],"naturalness":[66],"intelligibility":[69,172],"constructed":[72],"voice.To":[73],"tackle":[74],"these":[75],"problems":[76],"achieve":[78],"best":[80],"tradeoff,":[81],"we":[82,143],"utilize":[83],"vector":[85],"quantized":[86],"variational":[87],"autoencoder":[88],"(VQ-VAE)":[89],"multi-scale":[92],"codebook-tospectrogram":[93],"(Code2Spec)":[94],"inverter":[95,126],"trained":[96],"by":[97],"mean":[98],"square":[99],"error":[100],"adversarial":[102],"loss.The":[103],"VQ-VAE":[104,159],"extracts":[105],"to":[108,114,131,182],"latent":[110],"space,":[111],"forces":[112],"itself":[113],"map":[115],"it":[116],"into":[117],"nearest":[119],"codebook":[120,137],"produces":[122],"compressed":[123],"representation.Next,":[124],"generates":[127],"magnitude":[129],"spectrogram":[130],"voice,":[134],"given":[135],"vectors":[138],"from":[139],"VQ-VAE.In":[140],"experiments,":[142],"investigated":[145],"several":[146],"other":[147],"clustering":[148],"algorithms,":[149],"including":[150],"K-Means":[151],"GMM,":[153],"compared":[155,181],"them":[156],"result":[160],"on":[161],"ABX":[162,179],"scores":[163,180],"bit":[165],"rates.Our":[166],"proposed":[167],"approach":[168],"significantly":[169],"improved":[170],"(in":[173],"CER),":[174],"MOS,":[176],"official":[184],"2019":[186],"baseline":[187],"even":[189],"topline.":[191]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":10},{"year":2021,"cited_by_count":15},{"year":2020,"cited_by_count":19},{"year":2019,"cited_by_count":4}],"updated_date":"2026-03-13T16:22:10.518609","created_date":"2025-10-10T00:00:00"}
