{"id":"https://openalex.org/W4375869189","doi":"https://doi.org/10.1109/icassp49357.2023.10095373","title":"Robust Data2VEC: Noise-Robust Speech Representation Learning for ASR by Combining Regression and Improved Contrastive Learning","display_name":"Robust Data2VEC: Noise-Robust Speech Representation Learning for ASR by Combining Regression and Improved Contrastive Learning","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4375869189","doi":"https://doi.org/10.1109/icassp49357.2023.10095373"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095373","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095373","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045966396","display_name":"Qiushi Zhu","orcid":"https://orcid.org/0000-0002-1196-7781"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qiu-Shi Zhu","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106557565","display_name":"Long Zhou","orcid":"https://orcid.org/0009-0006-1919-4943"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Zhou","raw_affiliation_strings":["Microsoft Research Asia"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100436848","display_name":"Jie Zhang","orcid":"https://orcid.org/0000-0003-1124-0854"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-0785-8882"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shu-Jie Liu","raw_affiliation_strings":["Microsoft Research Asia"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074544822","display_name":"Yu\u2010Chen Hu","orcid":"https://orcid.org/0000-0002-5055-3645"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yu-Chen Hu","raw_affiliation_strings":["Nanyang Technological University,Singapore","Nanyang Technological University, Singapore"],"affiliations":[{"raw_affiliation_string":"Nanyang Technological University,Singapore","institution_ids":["https://openalex.org/I172675005"]},{"raw_affiliation_string":"Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057227915","display_name":"Li-Rong Dai","orcid":"https://orcid.org/0000-0002-0859-2827"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li-Rong Dai","raw_affiliation_strings":["University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),NERC-SLIP,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"NERC-SLIP, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5045966396"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":3.9221,"has_fulltext":false,"cited_by_count":23,"citation_normalized_percentile":{"value":0.94778194,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8083943128585815},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7572271227836609},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7381269335746765},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6457344889640808},{"id":"https://openalex.org/keywords/regression","display_name":"Regression","score":0.5633887648582458},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5041757822036743},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.502968966960907},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4638270139694214},{"id":"https://openalex.org/keywords/regression-analysis","display_name":"Regression analysis","score":0.45346179604530334},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.44964680075645447},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.44601914286613464},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.1170462965965271},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09827923774719238},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.08969077467918396}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8083943128585815},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7572271227836609},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7381269335746765},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6457344889640808},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.5633887648582458},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5041757822036743},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.502968966960907},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4638270139694214},{"id":"https://openalex.org/C152877465","wikidata":"https://www.wikidata.org/wiki/Q208042","display_name":"Regression analysis","level":2,"score":0.45346179604530334},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44964680075645447},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.44601914286613464},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1170462965965271},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09827923774719238},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.08969077467918396},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095373","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095373","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.7300000190734863}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":47,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2219249508","https://openalex.org/W2526425061","https://openalex.org/W2953070460","https://openalex.org/W2963399332","https://openalex.org/W3015213852","https://openalex.org/W3032514799","https://openalex.org/W3036601975","https://openalex.org/W3090114880","https://openalex.org/W3102363610","https://openalex.org/W3170554424","https://openalex.org/W3196965931","https://openalex.org/W3197580070","https://openalex.org/W3198771897","https://openalex.org/W3205533980","https://openalex.org/W3205644108","https://openalex.org/W3206531472","https://openalex.org/W3209059054","https://openalex.org/W3209376089","https://openalex.org/W3209984917","https://openalex.org/W3211582859","https://openalex.org/W4221140371","https://openalex.org/W4221145109","https://openalex.org/W4221156109","https://openalex.org/W4225699246","https://openalex.org/W4281807971","https://openalex.org/W4286685063","https://openalex.org/W4300980246","https://openalex.org/W4301372783","https://openalex.org/W4375869127","https://openalex.org/W4385822727","https://openalex.org/W6631190155","https://openalex.org/W6688816777","https://openalex.org/W6733814495","https://openalex.org/W6766224279","https://openalex.org/W6780191644","https://openalex.org/W6780218876","https://openalex.org/W6783961830","https://openalex.org/W6783990618","https://openalex.org/W6797016505","https://openalex.org/W6802465204","https://openalex.org/W6803254773","https://openalex.org/W6809431739","https://openalex.org/W6810007534","https://openalex.org/W6811201773","https://openalex.org/W6839558599","https://openalex.org/W6845338303"],"related_works":["https://openalex.org/W3208297503","https://openalex.org/W3119773509","https://openalex.org/W2889153461","https://openalex.org/W2964117661","https://openalex.org/W4388405611","https://openalex.org/W2619127353","https://openalex.org/W4289356671","https://openalex.org/W2389155397","https://openalex.org/W2165884543","https://openalex.org/W2312753042"],"abstract_inverted_index":{"Self-supervised":[0],"pre-training":[1,32,69,99,142],"methods":[2,76],"based":[3],"on":[4,146],"contrastive":[5,40,62,79,174],"learning":[6,41,57,63,175],"or":[7],"regression":[8,65,177,192],"tasks":[9,33,66,178],"can":[10,179],"utilize":[11],"more":[12],"unlabeled":[13],"data":[14],"to":[15,77,86,92,129,136,156,184,188],"improve":[16,137,157],"the":[17,25,30,61,68,94,98,106,120,131,138,147,158,161,173,181,191],"performance":[18,159],"of":[19,28,97,122,160,172],"automatic":[20],"speech":[21,55],"recognition":[22],"(ASR).":[23],"However,":[24],"robustness":[26,96],"impact":[27],"combining":[29],"two":[31,74],"and":[34,64,124,176],"constructing":[35],"different":[36,111],"negative":[37,90,115,125,134],"samples":[38,91,135],"for":[39,53,141],"still":[42],"remains":[43],"unclear.":[44],"In":[45],"this":[46],"paper,":[47],"we":[48,72,83,127],"propose":[49,85,128],"a":[50],"noise-robust":[51],"data2vec":[52],"self-supervised":[54],"representation":[56],"by":[58,104,118],"jointly":[59],"optimizing":[60],"in":[67,164],"stage.":[70],"Furthermore,":[71],"present":[73],"improved":[75],"facilitate":[78],"learning.":[80],"More":[81],"specifically,":[82],"first":[84],"construct":[87],"patch-based":[88],"non-semantic":[89],"boost":[93],"noise":[95],"model,":[100],"which":[101],"is":[102,154],"achieved":[103],"dividing":[105],"features":[107],"into":[108],"patches":[109],"at":[110],"sizes":[112],"(i.e.,":[113],"so-called":[114],"samples).":[116],"Second,":[117],"analyzing":[119],"distribution":[121],"positive":[123],"samples,":[126],"remove":[130],"easily":[132],"distinguishable":[133],"discriminative":[139],"capacity":[140],"models.":[143],"Experimental":[144],"results":[145],"CHiME-4":[148],"dataset":[149],"show":[150],"that":[151,169],"our":[152],"method":[153],"able":[155],"pre-trained":[162],"model":[163,182],"noisy":[165],"scenarios.":[166],"We":[167],"find":[168],"joint":[170],"training":[171,190],"avoid":[180],"collapse":[183],"some":[185],"extent":[186],"compared":[187],"only":[189],"task.":[193]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":10},{"year":2023,"cited_by_count":5}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
