{"id":"https://openalex.org/W3112613336","doi":"https://doi.org/10.21437/interspeech.2021-50","title":"Towards Unsupervised Phone and Word Segmentation Using Self-Supervised Vector-Quantized Neural Networks","display_name":"Towards Unsupervised Phone and Word Segmentation Using Self-Supervised Vector-Quantized Neural Networks","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3112613336","doi":"https://doi.org/10.21437/interspeech.2021-50","mag":"3112613336"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-50","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-50","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2012.07551","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040305929","display_name":"Herman Kamper","orcid":"https://orcid.org/0000-0003-2980-3475"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":true,"raw_author_name":"Herman Kamper","raw_affiliation_strings":["StellenBosch University"],"affiliations":[{"raw_affiliation_string":"StellenBosch University","institution_ids":["https://openalex.org/I26092322"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046382731","display_name":"Benjamin van Niekerk","orcid":"https://orcid.org/0000-0001-9207-6309"},"institutions":[{"id":"https://openalex.org/I26092322","display_name":"Stellenbosch University","ror":"https://ror.org/05bk57929","country_code":"ZA","type":"education","lineage":["https://openalex.org/I26092322"]}],"countries":["ZA"],"is_corresponding":false,"raw_author_name":"Benjamin van Niekerk","raw_affiliation_strings":["Stellenbosch University, Stellenbosch, South Africa"],"affiliations":[{"raw_affiliation_string":"Stellenbosch University, Stellenbosch, South Africa","institution_ids":["https://openalex.org/I26092322"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5040305929"],"corresponding_institution_ids":["https://openalex.org/I26092322"],"apc_list":null,"apc_paid":null,"fwci":1.2346,"has_fulltext":true,"cited_by_count":8,"citation_normalized_percentile":{"value":0.7816699,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1539","last_page":"1543"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.7792642116546631},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7778671979904175},{"id":"https://openalex.org/keywords/phone","display_name":"Phone","score":0.6220231056213379},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5700104236602783},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.546343207359314},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5457755923271179},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.5454275608062744},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5253586173057556},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.464072048664093},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4638654589653015},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4614168107509613},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.4540785253047943},{"id":"https://openalex.org/keywords/speech-segmentation","display_name":"Speech segmentation","score":0.4519859552383423},{"id":"https://openalex.org/keywords/feature-vector","display_name":"Feature vector","score":0.44800692796707153},{"id":"https://openalex.org/keywords/text-segmentation","display_name":"Text segmentation","score":0.4463995099067688},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.15901410579681396}],"concepts":[{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.7792642116546631},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7778671979904175},{"id":"https://openalex.org/C2778707766","wikidata":"https://www.wikidata.org/wiki/Q202064","display_name":"Phone","level":2,"score":0.6220231056213379},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5700104236602783},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.546343207359314},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5457755923271179},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.5454275608062744},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5253586173057556},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.464072048664093},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4638654589653015},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4614168107509613},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.4540785253047943},{"id":"https://openalex.org/C207030507","wikidata":"https://www.wikidata.org/wiki/Q2266173","display_name":"Speech segmentation","level":3,"score":0.4519859552383423},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.44800692796707153},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.4463995099067688},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.15901410579681396},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.21437/interspeech.2021-50","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-50","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2012.07551","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2012.07551","pdf_url":"https://arxiv.org/pdf/2012.07551","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:3112613336","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2012.07551.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2012.07551","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2012.07551","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2012.07551","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2012.07551","pdf_url":"https://arxiv.org/pdf/2012.07551","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"score":0.5199999809265137,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"},{"score":0.4699999988079071,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3112613336.pdf","grobid_xml":"https://content.openalex.org/works/W3112613336.grobid-xml"},"referenced_works_count":45,"referenced_works":["https://openalex.org/W130754613","https://openalex.org/W1796128977","https://openalex.org/W2010188467","https://openalex.org/W2025482506","https://openalex.org/W2052697931","https://openalex.org/W2117126688","https://openalex.org/W2126377586","https://openalex.org/W2145410271","https://openalex.org/W2242818861","https://openalex.org/W2345913943","https://openalex.org/W2395899413","https://openalex.org/W2396043527","https://openalex.org/W2400549570","https://openalex.org/W2404952642","https://openalex.org/W2407151108","https://openalex.org/W2468716020","https://openalex.org/W2478415332","https://openalex.org/W2483390977","https://openalex.org/W2516890051","https://openalex.org/W2620638943","https://openalex.org/W2780786457","https://openalex.org/W2842511635","https://openalex.org/W2962799131","https://openalex.org/W2963137467","https://openalex.org/W2963799213","https://openalex.org/W2964169922","https://openalex.org/W2971775690","https://openalex.org/W2972764223","https://openalex.org/W2972867623","https://openalex.org/W2972943112","https://openalex.org/W2973026522","https://openalex.org/W2996383576","https://openalex.org/W3006094508","https://openalex.org/W3008499099","https://openalex.org/W3018535504","https://openalex.org/W3093096176","https://openalex.org/W3095361818","https://openalex.org/W3096196861","https://openalex.org/W3096656254","https://openalex.org/W3097286738","https://openalex.org/W3097485645","https://openalex.org/W3097692357","https://openalex.org/W3098361150","https://openalex.org/W3102519966","https://openalex.org/W3125709657"],"related_works":["https://openalex.org/W3198134274","https://openalex.org/W2770180314","https://openalex.org/W3202232410","https://openalex.org/W3099193570","https://openalex.org/W1961881037","https://openalex.org/W3020818372","https://openalex.org/W2952218918","https://openalex.org/W2034091650","https://openalex.org/W2921676720","https://openalex.org/W2513716658","https://openalex.org/W3003516416","https://openalex.org/W3107725261","https://openalex.org/W3010611153","https://openalex.org/W3181737292","https://openalex.org/W2924027593","https://openalex.org/W3141904994","https://openalex.org/W2982106100","https://openalex.org/W2950728047","https://openalex.org/W2963445119","https://openalex.org/W1972293533"],"abstract_inverted_index":{"We":[0,12,85],"investigate":[1],"segmenting":[2],"and":[3,112],"clustering":[4],"speech":[5,41],"into":[6,42],"low-bitrate":[7],"phone-like":[8],"sequences":[9],"without":[10,95],"supervision.":[11],"specifically":[13],"constrain":[14],"pretrained":[15],"self-supervised":[16],"vector-quantized":[17],"(VQ)":[18],"neural":[19],"networks":[20],"so":[21],"that":[22,87],"blocks":[23],"of":[24,39,61,101],"contiguous":[25],"feature":[26],"vectors":[27],"are":[28,48,54,63],"assigned":[29],"to":[30,70,79,115,137],"the":[31,40,51,138],"same":[32],"code,":[33],"thereby":[34],"giving":[35],"a":[36,58,72,76,98,116,146,153],"variable-rate":[37],"segmentation":[38,46,90,119],"discrete":[43],"units.":[44],"Two":[45],"methods":[47,91],"considered.":[49],"In":[50],"first,":[52],"features":[53],"greedily":[55],"merged":[56],"until":[57],"prespecified":[59],"number":[60],"segments":[62],"reached.":[64],"The":[65,121],"second":[66],"uses":[67],"dynamic":[68,123],"programming":[69,124],"optimize":[71],"squared":[73],"error":[74],"with":[75],"penalty":[77],"term":[78],"encourage":[80],"fewer":[81],"but":[82],"longer":[83],"segments.":[84],"show":[86],"these":[88],"VQ":[89],"can":[92],"be":[93],"used":[94],"alteration":[96],"across":[97],"wide":[99],"range":[100],"tasks:":[102],"unsupervised":[103],"phone":[104,107],"segmentation,":[105],"ABX":[106],"discrimination,":[108,111],"same-different":[109],"word":[110,118],"as":[113],"inputs":[114],"symbolic":[117],"algorithm.":[120],"penalized":[122],"method":[125],"generally":[126],"performs":[127],"best.":[128],"While":[129],"performance":[130],"on":[131],"individual":[132],"tasks":[133,145],"is":[134,150],"only":[135],"comparable":[136],"state-of-the-art":[139],"in":[140,143],"some":[141],"cases,":[142],"all":[144],"reasonable":[147],"competing":[148],"approach":[149],"outperformed":[151],"at":[152],"substantially":[154],"lower":[155],"bitrate.":[156]},"counts_by_year":[{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":5}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
