{"id":"https://openalex.org/W4416251752","doi":"https://doi.org/10.1109/waspaa66052.2025.11230988","title":"Robust One-step Speech Enhancement via Consistency Distillation","display_name":"Robust One-step Speech Enhancement via Consistency Distillation","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416251752","doi":"https://doi.org/10.1109/waspaa66052.2025.11230988"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230988","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230988","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100673719","display_name":"Liang Xu","orcid":"https://orcid.org/0000-0001-8171-0247"},"institutions":[{"id":"https://openalex.org/I41156924","display_name":"Victoria University of Wellington","ror":"https://ror.org/0040r6f76","country_code":"NZ","type":"education","lineage":["https://openalex.org/I41156924"]}],"countries":["NZ"],"is_corresponding":true,"raw_author_name":"Liang Xu","raw_affiliation_strings":["Victoria University of Wellington,School of Engineering and Computer Science,Wellington,New Zealand,6140"],"affiliations":[{"raw_affiliation_string":"Victoria University of Wellington,School of Engineering and Computer Science,Wellington,New Zealand,6140","institution_ids":["https://openalex.org/I41156924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064315768","display_name":"Longfei Yan","orcid":"https://orcid.org/0000-0003-4273-198X"},"institutions":[{"id":"https://openalex.org/I41156924","display_name":"Victoria University of Wellington","ror":"https://ror.org/0040r6f76","country_code":"NZ","type":"education","lineage":["https://openalex.org/I41156924"]}],"countries":["NZ"],"is_corresponding":false,"raw_author_name":"Longfei Felix Yan","raw_affiliation_strings":["Victoria University of Wellington,School of Engineering and Computer Science,Wellington,New Zealand,6140"],"affiliations":[{"raw_affiliation_string":"Victoria University of Wellington,School of Engineering and Computer Science,Wellington,New Zealand,6140","institution_ids":["https://openalex.org/I41156924"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5087492771","display_name":"W. Bastiaan Kleijn","orcid":"https://orcid.org/0000-0002-1973-3920"},"institutions":[{"id":"https://openalex.org/I41156924","display_name":"Victoria University of Wellington","ror":"https://ror.org/0040r6f76","country_code":"NZ","type":"education","lineage":["https://openalex.org/I41156924"]}],"countries":["NZ"],"is_corresponding":false,"raw_author_name":"W. Bastiaan Kleijn","raw_affiliation_strings":["Victoria University of Wellington,School of Engineering and Computer Science,Wellington,New Zealand,6140"],"affiliations":[{"raw_affiliation_string":"Victoria University of Wellington,School of Engineering and Computer Science,Wellington,New Zealand,6140","institution_ids":["https://openalex.org/I41156924"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100673719"],"corresponding_institution_ids":["https://openalex.org/I41156924"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.4552039,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.002300000051036477,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.0013000000035390258,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.7148000001907349},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6758000254631042},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.5544999837875366},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5121999979019165},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5091999769210815},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.435699999332428},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.392300009727478},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.3758000135421753}],"concepts":[{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.7148000001907349},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6962000131607056},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6758000254631042},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.5544999837875366},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5121999979019165},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5091999769210815},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.435699999332428},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4032000005245209},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.392300009727478},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.3758000135421753},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3504999876022339},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.3474999964237213},{"id":"https://openalex.org/C37279795","wikidata":"https://www.wikidata.org/wiki/Q2492305","display_name":"Consistency model","level":3,"score":0.33320000767707825},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.3061999976634979},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2937000095844269},{"id":"https://openalex.org/C31388003","wikidata":"https://www.wikidata.org/wiki/Q7624548","display_name":"Strong consistency","level":3,"score":0.28290000557899475},{"id":"https://openalex.org/C117765406","wikidata":"https://www.wikidata.org/wiki/Q5362437","display_name":"Generalization error","level":3,"score":0.27570000290870667},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2517000138759613},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230988","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230988","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W1974387177","https://openalex.org/W2013035813","https://openalex.org/W2151542770","https://openalex.org/W2516001803","https://openalex.org/W2603567530","https://openalex.org/W2767831780","https://openalex.org/W2892110446","https://openalex.org/W2952218014","https://openalex.org/W2964058413","https://openalex.org/W3097906045","https://openalex.org/W3161480375","https://openalex.org/W3197912330","https://openalex.org/W3202278141","https://openalex.org/W4200483526","https://openalex.org/W4221144097","https://openalex.org/W4225302959","https://openalex.org/W4232282348","https://openalex.org/W4372341094","https://openalex.org/W4380434618","https://openalex.org/W4384080510","https://openalex.org/W4386764256","https://openalex.org/W4392903345","https://openalex.org/W4392903897","https://openalex.org/W4402111353","https://openalex.org/W4402112483","https://openalex.org/W4402112506","https://openalex.org/W4403126543","https://openalex.org/W4406795023","https://openalex.org/W4408354850"],"related_works":[],"abstract_inverted_index":{"Diffusion":[0],"models":[1,44],"have":[2],"shown":[3],"strong":[4],"performance":[5,156,175],"in":[6,132,176],"speech":[7,146,179],"enhancement,":[8,147],"but":[9],"their":[10],"real-time":[11],"applicability":[12],"has":[13,22],"been":[14],"limited":[15],"by":[16,29],"multistep":[17],"iterative":[18],"sampling.":[19],"Consistency":[20,83],"distillation":[21,142],"recently":[23],"emerged":[24],"as":[25],"a":[26,31,36,85,90,97],"promising":[27],"alternative":[28],"distilling":[30,89],"one-step":[32,91,113,140],"consistency":[33,43,92,141],"model":[34,114,131,143,172],"from":[35,67,124],"multi-step":[37],"diffusion-based":[38,145],"teacher":[39,54,69,130,161],"model.":[40,70,93,162],"However,":[41],"distilled":[42],"are":[45],"inherently":[46],"biased":[47],"towards":[48],"the":[49,53,68,103,112,129,137,165,170],"sampling":[50],"trajectory":[51,100],"of":[52,178],"model,":[55],"making":[56],"them":[57],"less":[58],"robust":[59],"to":[60,64,101,106,122,158],"noise":[61],"and":[62,127,154,192],"prone":[63],"inheriting":[65],"inaccuracies":[66],"To":[71],"address":[72],"this":[73],"limitation,":[74],"we":[75,95,109],"propose":[76],"ROSE-CD:":[77],"Robust":[78],"One-step":[79],"Speech":[80],"Enhancement":[81],"via":[82],"Distillation,":[84],"novel":[86],"approach":[87],"for":[88,144],"Specifically,":[94],"introduce":[96],"randomized":[98],"learning":[99],"improve":[102],"model\u2019s":[104],"robustness":[105],"noise.":[107],"Furthermore,":[108],"jointly":[110],"optimize":[111],"with":[115],"two":[116],"time-domain":[117],"auxiliary":[118],"losses,":[119],"enabling":[120],"it":[121],"recover":[123],"teacher-induced":[125],"errors":[126],"surpass":[128],"overall":[133],"performance.":[134],"This":[135],"is":[136,185],"first":[138],"pure":[139],"achieving":[148],"54":[149],"times":[150],"faster":[151],"inference":[152],"speed":[153],"superior":[155],"compared":[157],"its":[159,182],"30-step":[160],"Experiments":[163],"on":[164,187],"VoiceBank-DEMAND":[166],"dataset":[167,191],"demonstrate":[168],"that":[169],"proposed":[171],"achieves":[173],"state-of-the-art":[174],"terms":[177],"quality.":[180],"Moreover,":[181],"generalization":[183],"ability":[184],"validated":[186],"both":[188],"an":[189],"out-of-domain":[190],"real-world":[193],"noisy":[194],"recordings.":[195]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
