{"id":"https://openalex.org/W3027324582","doi":"https://doi.org/10.21437/interspeech.2020-1693","title":"Vector-Quantized Neural Networks for Acoustic Unit Discovery in the ZeroSpeech 2020 Challenge","display_name":"Vector-Quantized Neural Networks for Acoustic Unit Discovery in the ZeroSpeech 2020 Challenge","publication_year":2020,"publication_date":"2020-10-25","ids":{"openalex":"https://openalex.org/W3027324582","doi":"https://doi.org/10.21437/interspeech.2020-1693","mag":"3027324582"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2020-1693","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-1693","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2005.09409","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046382731","display_name":"Benjamin van Niekerk","orcid":"https://orcid.org/0000-0001-9207-6309"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":true,"raw_author_name":"Benjamin van Niekerk","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020033363","display_name":"Leanne Nortje","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leanne Nortje","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5040305929","display_name":"Herman Kamper","orcid":"https://orcid.org/0000-0003-2980-3475"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Herman Kamper","raw_affiliation_strings":["StellenBosch University"],"affiliations":[{"raw_affiliation_string":"StellenBosch University","institution_ids":["https://openalex.org/I26092322"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5046382731"],"corresponding_institution_ids":["https://openalex.org/I26092322"],"apc_list":null,"apc_paid":null,"fwci":2.1941,"has_fulltext":true,"cited_by_count":16,"citation_normalized_percentile":{"value":0.90050477,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"4836","last_page":"4840"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.7824497222900391},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7032983899116516},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.649599015712738},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.601822555065155},{"id":"https://openalex.org/keywords/learning-vector-quantization","display_name":"Learning vector quantization","score":0.45076119899749756},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4415537714958191},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.42452383041381836},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.4192032516002655},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.4185929298400879},{"id":"https://openalex.org/keywords/phone","display_name":"Phone","score":0.41814783215522766},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41228818893432617},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.21320819854736328}],"concepts":[{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.7824497222900391},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7032983899116516},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.649599015712738},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.601822555065155},{"id":"https://openalex.org/C40567965","wikidata":"https://www.wikidata.org/wiki/Q1820283","display_name":"Learning vector quantization","level":3,"score":0.45076119899749756},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4415537714958191},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.42452383041381836},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.4192032516002655},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.4185929298400879},{"id":"https://openalex.org/C2778707766","wikidata":"https://www.wikidata.org/wiki/Q202064","display_name":"Phone","level":2,"score":0.41814783215522766},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41228818893432617},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.21320819854736328},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2020-1693","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-1693","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2005.09409","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2005.09409","pdf_url":"https://arxiv.org/pdf/2005.09409","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:3027324582","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2005.09409.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2005.09409","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2005.09409","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2005.09409","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2005.09409","pdf_url":"https://arxiv.org/pdf/2005.09409","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.5600000023841858},{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.4099999964237213}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3027324582.pdf","grobid_xml":"https://content.openalex.org/works/W3027324582.grobid-xml"},"referenced_works_count":40,"referenced_works":["https://openalex.org/W1778492285","https://openalex.org/W2010188467","https://openalex.org/W2078769636","https://openalex.org/W2100768664","https://openalex.org/W2117041980","https://openalex.org/W2242818861","https://openalex.org/W2346964103","https://openalex.org/W2347098582","https://openalex.org/W2395899413","https://openalex.org/W2396043527","https://openalex.org/W2516890051","https://openalex.org/W2547039119","https://openalex.org/W2598638573","https://openalex.org/W2780786457","https://openalex.org/W2789543585","https://openalex.org/W2842511635","https://openalex.org/W2890983311","https://openalex.org/W2911249026","https://openalex.org/W2945769669","https://openalex.org/W2949382160","https://openalex.org/W2954386831","https://openalex.org/W2963112338","https://openalex.org/W2963620343","https://openalex.org/W2963799213","https://openalex.org/W2963830550","https://openalex.org/W2964026424","https://openalex.org/W2964121744","https://openalex.org/W2971775690","https://openalex.org/W2972374322","https://openalex.org/W2972867623","https://openalex.org/W2972943112","https://openalex.org/W2973026522","https://openalex.org/W2995680346","https://openalex.org/W2996383576","https://openalex.org/W3003875258","https://openalex.org/W3016181583","https://openalex.org/W3018535504","https://openalex.org/W3097692357","https://openalex.org/W3098361150","https://openalex.org/W3125709657"],"related_works":["https://openalex.org/W3095361818","https://openalex.org/W2963799213","https://openalex.org/W3161411634","https://openalex.org/W3125709657","https://openalex.org/W2936295285","https://openalex.org/W1897174440","https://openalex.org/W1606170376","https://openalex.org/W3133808965","https://openalex.org/W3024605872","https://openalex.org/W1927229584","https://openalex.org/W2118590298","https://openalex.org/W379956812","https://openalex.org/W2150111211","https://openalex.org/W2982602185","https://openalex.org/W2158243901","https://openalex.org/W2095871591","https://openalex.org/W2164364897","https://openalex.org/W2119098704","https://openalex.org/W1026216498","https://openalex.org/W2019572577"],"abstract_inverted_index":{"In":[0,118],"this":[1,36],"paper,":[2],"we":[3,14],"explore":[4],"vector":[5,41,83,174],"quantization":[6,42,84,175],"for":[7,113],"acoustic":[8,102],"unit":[9],"discovery.":[10],"Leveraging":[11],"unlabelled":[12],"data,":[13],"aim":[15],"to":[16,34,43,47,93,128,167,183],"learn":[17,94],"discrete":[18,72],"representations":[19],"of":[20,51,59,71,97,138],"speech":[21,67,98],"that":[22,173],"separate":[23],"phonetic":[24],"content":[25],"from":[26],"speaker-specific":[27],"details.":[28],"We":[29,104],"propose":[30],"two":[31],"neural":[32],"models":[33,107,124,143,182],"tackle":[35],"challenge":[37],"-":[38],"both":[39,123],"use":[40],"map":[44],"continuous":[45],"features":[46],"a":[48,57,69,95,135,148],"finite":[49],"set":[50],"codes.":[52],"The":[53,64,90,142],"first":[54],"model":[55,81],"is":[56,92,163,176],"type":[58],"vector-quantized":[60],"variational":[61],"autoencoder":[62],"(VQ-VAE).":[63],"VQ-VAE":[65],"encodes":[66],"into":[68],"sequence":[70],"units":[73],"before":[74],"reconstructing":[75],"the":[76,106,114,129,154,181],"audio":[77],"waveform.":[78],"Our":[79],"second":[80],"combines":[82],"with":[85,134],"contrastive":[86],"predictive":[87],"coding":[88],"(VQ-CPC).":[89],"idea":[91],"representation":[96],"by":[99],"predicting":[100],"future":[101],"units.":[103],"evaluate":[105],"on":[108,147],"English":[109],"and":[110,131,162,165],"Indonesian":[111],"data":[112],"ZeroSpeech":[115],"2020":[116,132],"challenge.":[117],"ABX":[119],"phone":[120],"discrimination":[121],"tests,":[122],"outperform":[125],"all":[126],"submissions":[127],"2019":[130],"challenges,":[133],"relative":[136],"improvement":[137],"more":[139],"than":[140],"30%.":[141],"also":[144],"perform":[145],"competitively":[146],"downstream":[149],"voice":[150],"conversion":[151],"task.":[152],"Of":[153],"two,":[155],"VQ-CPC":[156],"performs":[157],"slightly":[158],"better":[159],"in":[160],"general":[161],"simpler":[164],"faster":[166],"train.":[168],"Finally,":[169],"probing":[170],"experiments":[171],"show":[172],"an":[177],"effective":[178],"bottleneck,":[179],"forcing":[180],"discard":[184],"speaker":[185],"information.":[186]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":3}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
