{"id":"https://openalex.org/W4376456759","doi":"https://doi.org/10.1109/taslp.2023.3275033","title":"A Joint Speech Enhancement and Self-Supervised Representation Learning Framework for Noise-Robust Speech Recognition","display_name":"A Joint Speech Enhancement and Self-Supervised Representation Learning Framework for Noise-Robust Speech Recognition","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4376456759","doi":"https://doi.org/10.1109/taslp.2023.3275033"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3275033","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3275033","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045966396","display_name":"Qiushi Zhu","orcid":"https://orcid.org/0000-0002-1196-7781"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Qiu-Shi Zhu","raw_affiliation_strings":["Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-1196-7781","affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100436848","display_name":"Jie Zhang","orcid":"https://orcid.org/0000-0003-1124-0854"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China"],"raw_orcid":"https://orcid.org/0000-0003-1124-0854","affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101577318","display_name":"Ziqiang Zhang","orcid":"https://orcid.org/0000-0003-0110-1543"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zi-Qiang Zhang","raw_affiliation_strings":["Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China"],"raw_orcid":"https://orcid.org/0000-0003-0110-1543","affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057227915","display_name":"Li-Rong Dai","orcid":"https://orcid.org/0000-0002-0859-2827"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li-Rong Dai","raw_affiliation_strings":["Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China"],"raw_orcid":"https://orcid.org/0000-0002-0859-2827","affiliations":[{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Department of Electronic Engineering and Information Science, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5045966396"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":7.8676,"has_fulltext":false,"cited_by_count":41,"citation_normalized_percentile":{"value":0.98322501,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"31","issue":null,"first_page":"1927","last_page":"1939"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7988834381103516},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7734198570251465},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6467441320419312},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5439491271972656},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.5379003286361694},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5170384049415588},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.4161931276321411},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3876188099384308},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.24151107668876648}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7988834381103516},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7734198570251465},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6467441320419312},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5439491271972656},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.5379003286361694},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5170384049415588},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.4161931276321411},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3876188099384308},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.24151107668876648},{"id":"https://openalex.org/C554190296","wikidata":"https://www.wikidata.org/wiki/Q47528","display_name":"Radar","level":2,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3275033","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3275033","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.7799999713897705,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[{"id":"https://openalex.org/G634576251","display_name":null,"funder_award_id":"62101523","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":76,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1494198834","https://openalex.org/W1897240248","https://openalex.org/W1974387177","https://openalex.org/W2006129368","https://openalex.org/W2033875152","https://openalex.org/W2102113734","https://openalex.org/W2124509324","https://openalex.org/W2127141656","https://openalex.org/W2128653836","https://openalex.org/W2153894152","https://openalex.org/W2160815625","https://openalex.org/W2193413348","https://openalex.org/W2296167893","https://openalex.org/W2526425061","https://openalex.org/W2547875792","https://openalex.org/W2746457594","https://openalex.org/W2777662428","https://openalex.org/W2802304149","https://openalex.org/W2892009249","https://openalex.org/W2898446161","https://openalex.org/W2916979304","https://openalex.org/W2936774411","https://openalex.org/W2937484199","https://openalex.org/W2943554574","https://openalex.org/W2952218014","https://openalex.org/W2955058313","https://openalex.org/W2962935966","https://openalex.org/W2963399332","https://openalex.org/W2971033911","https://openalex.org/W2971417062","https://openalex.org/W2972320711","https://openalex.org/W2972412503","https://openalex.org/W2972943112","https://openalex.org/W2973049979","https://openalex.org/W2979476256","https://openalex.org/W3012383481","https://openalex.org/W3015213852","https://openalex.org/W3015265920","https://openalex.org/W3016181583","https://openalex.org/W3032514799","https://openalex.org/W3036601975","https://openalex.org/W3041561163","https://openalex.org/W3097945073","https://openalex.org/W3099330747","https://openalex.org/W3102342027","https://openalex.org/W3109196171","https://openalex.org/W3163142165","https://openalex.org/W3198771897","https://openalex.org/W3205533980","https://openalex.org/W3206252155","https://openalex.org/W3206531472","https://openalex.org/W3207558756","https://openalex.org/W3209059054","https://openalex.org/W3209376089","https://openalex.org/W3209984917","https://openalex.org/W4221140371","https://openalex.org/W4225699246","https://openalex.org/W4226390724","https://openalex.org/W4229003981","https://openalex.org/W4297808394","https://openalex.org/W4297841603","https://openalex.org/W4312356750","https://openalex.org/W4312881242","https://openalex.org/W4385245566","https://openalex.org/W6623517193","https://openalex.org/W6675365184","https://openalex.org/W6687566353","https://openalex.org/W6729448088","https://openalex.org/W6739901393","https://openalex.org/W6747381837","https://openalex.org/W6766224279","https://openalex.org/W6769196770","https://openalex.org/W6780218876","https://openalex.org/W6811201773","https://openalex.org/W6844194202"],"related_works":["https://openalex.org/W1974895211","https://openalex.org/W2129841057","https://openalex.org/W3040712279","https://openalex.org/W2176409448","https://openalex.org/W2364769705","https://openalex.org/W2056136368","https://openalex.org/W2374664672","https://openalex.org/W3096184950","https://openalex.org/W4231424160","https://openalex.org/W2275432853"],"abstract_inverted_index":{"Though":[0],"speech":[1,9,25,111],"enhancement":[2],"(SE)":[3],"can":[4,133,188],"be":[5,159],"used":[6],"to":[7,38,74,102,123,146,158,198],"improve":[8,39,75,189],"quality":[10],"in":[11,78,82],"noisy":[12,79,88,128,179,195],"environments,":[13],"it":[14],"may":[15],"also":[16],"cause":[17],"distortions":[18],"that":[19,71,182],"degrade":[20],"the":[21,32,40,47,50,83,86,91,99,104,108,114,125,136,147,153,183,190],"performance":[22,77,192],"of":[23,43,49,53,127,150,162],"automatic":[24],"recognition":[26],"(ASR)":[27],"models.":[28,45],"Self-supervised":[29],"pre-training,":[30],"on":[31,174],"other":[33],"hand,":[34],"has":[35],"been":[36],"shown":[37],"noise":[41,201],"robustness":[42],"ASR":[44,76,166,191],"However,":[46],"potential":[48],"(optimal)":[51],"integration":[52],"SE":[54,73,95],"and":[55,129,177],"self-supervised":[56,68,100],"pre-training":[57,69,84],"still":[58],"remains":[59],"unclear.":[60],"In":[61],"this":[62],"paper,":[63],"we":[64,117],"propose":[65,118],"a":[66,119,160,199],"novel":[67],"framework":[70],"incorporates":[72],"environments.":[80],"First,":[81],"phase":[85],"original":[87],"waveform":[89,92],"or":[90],"obtained":[93],"by":[94,140],"is":[96],"fed":[97],"into":[98],"model":[101],"learn":[103],"contextual":[105],"representation,":[106],"where":[107],"quantized":[109],"clean":[110],"acts":[112],"as":[113],"target.":[115],"Second,":[116],"dual-attention":[120],"fusion":[121],"method":[122,155],"fuse":[124],"features":[126],"enhanced":[130,169],"speech,":[131],"which":[132],"compensate":[134],"for":[135],"information":[137],"loss":[138],"caused":[139],"separately":[141],"using":[142],"individual":[143],"modules.":[144],"Due":[145],"flexible":[148],"exploitation":[149],"clean/noisy/enhanced":[151],"branches,":[152],"proposed":[154,184],"turns":[156],"out":[157],"generalization":[161],"some":[163],"existing":[164],"noise-robust":[165],"models,":[167],"e.g.,":[168],"wav2vec2.0.":[170],"Finally,":[171],"experimental":[172],"results":[173],"both":[175],"synthetic":[176],"real":[178],"datasets":[180],"show":[181],"joint":[185],"training":[186],"approach":[187],"under":[193],"various":[194],"settings,":[196],"leading":[197],"stronger":[200],"robustness.":[202]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":15},{"year":2024,"cited_by_count":16},{"year":2023,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
