{"id":"https://openalex.org/W4226033575","doi":"https://doi.org/10.1109/asru51503.2021.9688253","title":"w2v-BERT: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training","display_name":"w2v-BERT: Combining Contrastive Learning and Masked Language Modeling for Self-Supervised Speech Pre-Training","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W4226033575","doi":"https://doi.org/10.1109/asru51503.2021.9688253"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9688253","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9688253","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008107321","display_name":"Yu-An Chung","orcid":"https://orcid.org/0000-0001-9451-7956"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yu-An Chung","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory","Google Brain"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory","institution_ids":[]},{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100433648","display_name":"Yu Zhang","orcid":"https://orcid.org/0000-0002-9505-1833"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yu Zhang","raw_affiliation_strings":["Google Brain"],"affiliations":[{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100750907","display_name":"Wei Han","orcid":"https://orcid.org/0000-0002-4201-9645"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wei Han","raw_affiliation_strings":["Google Brain"],"affiliations":[{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027763497","display_name":"Chung\u2010Cheng Chiu","orcid":"https://orcid.org/0000-0001-9729-4778"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chung-Cheng Chiu","raw_affiliation_strings":["Google Brain"],"affiliations":[{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048771433","display_name":"James Qin","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"James Qin","raw_affiliation_strings":["Google Brain"],"affiliations":[{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112901893","display_name":"Ruoming Pang","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ruoming Pang","raw_affiliation_strings":["Google Brain"],"affiliations":[{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010253402","display_name":"Yonghui Wu","orcid":"https://orcid.org/0000-0002-6780-6135"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yonghui Wu","raw_affiliation_strings":["Google Brain"],"affiliations":[{"raw_affiliation_string":"Google Brain","institution_ids":["https://openalex.org/I1291425158"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5008107321"],"corresponding_institution_ids":["https://openalex.org/I1291425158"],"apc_list":null,"apc_paid":null,"fwci":23.9823,"has_fulltext":false,"cited_by_count":256,"citation_normalized_percentile":{"value":0.99759726,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"244","last_page":"250"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8221430778503418},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7031056880950928},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.58524489402771},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5311288833618164},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.52735835313797},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5084251761436462},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.46254441142082214},{"id":"https://openalex.org/keywords/test-set","display_name":"Test set","score":0.4529997706413269},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4418858587741852},{"id":"https://openalex.org/keywords/discretization","display_name":"Discretization","score":0.4344298839569092},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4152686297893524},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3420558273792267}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8221430778503418},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7031056880950928},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.58524489402771},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5311288833618164},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.52735835313797},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5084251761436462},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.46254441142082214},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.4529997706413269},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4418858587741852},{"id":"https://openalex.org/C73000952","wikidata":"https://www.wikidata.org/wiki/Q17007827","display_name":"Discretization","level":2,"score":0.4344298839569092},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4152686297893524},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3420558273792267},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9688253","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9688253","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6100000143051147,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W165878654","https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W1828163288","https://openalex.org/W1836465849","https://openalex.org/W2064675550","https://openalex.org/W2088622183","https://openalex.org/W2101210369","https://openalex.org/W2111316763","https://openalex.org/W2121879602","https://openalex.org/W2842511635","https://openalex.org/W2896457183","https://openalex.org/W2928408492","https://openalex.org/W2936774411","https://openalex.org/W2940322076","https://openalex.org/W2962907457","https://openalex.org/W2972943112","https://openalex.org/W2973049979","https://openalex.org/W2979476256","https://openalex.org/W2982223350","https://openalex.org/W2988736778","https://openalex.org/W2991213871","https://openalex.org/W2995181338","https://openalex.org/W3003875258","https://openalex.org/W3015265920","https://openalex.org/W3015522062","https://openalex.org/W3015995734","https://openalex.org/W3016011332","https://openalex.org/W3026041220","https://openalex.org/W3035202887","https://openalex.org/W3036601975","https://openalex.org/W3041561163","https://openalex.org/W3093579165","https://openalex.org/W3097777922","https://openalex.org/W3112034174","https://openalex.org/W3128768055","https://openalex.org/W3157697407","https://openalex.org/W3160525311","https://openalex.org/W3209059054","https://openalex.org/W4210463634","https://openalex.org/W4293569541","https://openalex.org/W4297808394","https://openalex.org/W4320930577","https://openalex.org/W4385245566","https://openalex.org/W6631190155","https://openalex.org/W6638667902","https://openalex.org/W6638749077","https://openalex.org/W6739901393","https://openalex.org/W6746023985","https://openalex.org/W6751104502","https://openalex.org/W6755207826","https://openalex.org/W6769196770","https://openalex.org/W6770506093","https://openalex.org/W6770514103","https://openalex.org/W6771812881","https://openalex.org/W6780218876","https://openalex.org/W6784532283","https://openalex.org/W6786669483"],"related_works":["https://openalex.org/W2965546495","https://openalex.org/W4389116644","https://openalex.org/W2153315159","https://openalex.org/W3103844505","https://openalex.org/W259157601","https://openalex.org/W4205463238","https://openalex.org/W2006251942","https://openalex.org/W2113687551","https://openalex.org/W2112752961","https://openalex.org/W2342291550"],"abstract_inverted_index":{"Motivated":[0],"by":[1,113,198],"the":[2,37,40,57,60,74,115,140,145,150,177,185],"success":[3],"of":[4,52],"masked":[5,70],"language":[6,12],"modeling":[7],"(MLM)":[8],"in":[9,109],"pre-training":[10,83],"natural":[11],"processing":[13],"models,":[14],"we":[15],"propose":[16],"w2v-BERT":[17,26,105,129,191],"that":[18,30,128],"explores":[19],"MLM":[20],"for":[21],"self-supervised":[22,117],"speech":[23,46,54,65,82],"representation":[24],"learning.":[25],"is":[27],"a":[28,49,69],"framework":[29],"combines":[31],"contrastive":[32,120],"learning":[33],"and":[34,56,94,122,165,179],"MLM,":[35],"where":[36],"former":[38],"trains":[39,59],"model":[41,61,168],"to":[42,62,79,134,157,171,184],"discretize":[43],"input":[44],"continuous":[45],"signals":[47],"into":[48],"finite":[50],"set":[51],"discriminative":[53],"tokens,":[55],"latter":[58],"learn":[63],"contextualized":[64],"representations":[66],"via":[67],"solving":[68,114],"prediction":[71],"task":[72,121],"consuming":[73],"discretized":[75],"tokens.":[76],"In":[77,153],"contrast":[78],"existing":[80],"MLM-based":[81],"frameworks":[84],"such":[85,160],"as":[86,149,161],"HuBERT,":[87,166],"which":[88,99],"relies":[89],"on":[90,139,176],"an":[91,110],"iterative":[92],"re-clustering":[93],"re-training":[95],"process,":[96],"or":[97],"vq-wav2vec,":[98],"concatenates":[100],"two":[101,116],"separately":[102],"trained":[103],"modules,":[104],"can":[106],"be":[107],"optimized":[108],"end-to-end":[111],"fashion":[112],"tasks":[118],"(the":[119],"MLM)":[123],"simultaneously.":[124],"Our":[125],"experiments":[126],"show":[127],"achieves":[130],"competitive":[131],"results":[132],"compared":[133,156],"current":[135],"state-of-the-art":[136],"pre-trained":[137],"models":[138,159],"LibriSpeech":[141],"benchmarks":[142],"when":[143,155],"using":[144],"Libri-Light":[146],"60k":[147],"corpus":[148],"unsupervised":[151],"data.":[152],"particular,":[154],"published":[158],"conformer-based":[162,195],"wav2vec":[163,196],"2.0":[164,197],"our":[167,193],"shows":[169],"5%":[170],"10%":[172],"relative":[173],"WER":[174],"reduction":[175],"test-clean":[178],"test-other":[180],"subsets.":[181],"When":[182],"applied":[183],"Google's":[186],"Voice":[187],"Search":[188],"traffic":[189],"dataset,":[190],"outperforms":[192],"internal":[194],"more":[199],"than":[200],"30%":[201],"relatively.":[202]},"counts_by_year":[{"year":2026,"cited_by_count":8},{"year":2025,"cited_by_count":59},{"year":2024,"cited_by_count":78},{"year":2023,"cited_by_count":71},{"year":2022,"cited_by_count":40}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
