{"id":"https://openalex.org/W3197734536","doi":"https://doi.org/10.21437/interspeech.2021-259","title":"Speech Enhancement with Weakly Labelled Data from AudioSet","display_name":"Speech Enhancement with Weakly Labelled Data from AudioSet","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3197734536","doi":"https://doi.org/10.21437/interspeech.2021-259","mag":"3197734536"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-259","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-259","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072482416","display_name":"Qiuqiang Kong","orcid":"https://orcid.org/0000-0003-2864-0475"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qiuqiang Kong","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009048990","display_name":"Haohe Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haohe Liu","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091528537","display_name":"Xingjian Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xingjian Du","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100379262","display_name":"Li Chen","orcid":"https://orcid.org/0000-0002-5842-838X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li Chen","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035534911","display_name":"Rui Xia","orcid":"https://orcid.org/0000-0002-0621-1058"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rui Xia","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101664515","display_name":"Yuxuan Wang","orcid":"https://orcid.org/0009-0005-3508-3736"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuxuan Wang","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5072482416"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.0665,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.77045301,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"191","last_page":"195"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.788718581199646},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7772728204727173},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.6839449405670166},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.6350135803222656},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.5696478486061096},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.48876190185546875},{"id":"https://openalex.org/keywords/psqm","display_name":"PSQM","score":0.4881304204463959},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.48091360926628113},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.45550134778022766},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.4511151611804962},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.41974273324012756},{"id":"https://openalex.org/keywords/pesq","display_name":"PESQ","score":0.4166858494281769},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38958555459976196},{"id":"https://openalex.org/keywords/linear-predictive-coding","display_name":"Linear predictive coding","score":0.38550955057144165},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.15034809708595276}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.788718581199646},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7772728204727173},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.6839449405670166},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.6350135803222656},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.5696478486061096},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.48876190185546875},{"id":"https://openalex.org/C108699837","wikidata":"https://www.wikidata.org/wiki/Q7120750","display_name":"PSQM","level":4,"score":0.4881304204463959},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.48091360926628113},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.45550134778022766},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.4511151611804962},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.41974273324012756},{"id":"https://openalex.org/C103734657","wikidata":"https://www.wikidata.org/wiki/Q2739975","display_name":"PESQ","level":4,"score":0.4166858494281769},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38958555459976196},{"id":"https://openalex.org/C59883199","wikidata":"https://www.wikidata.org/wiki/Q1826438","display_name":"Linear predictive coding","level":3,"score":0.38550955057144165},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.15034809708595276},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2021-259","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-259","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8100000023841858,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1495679096","https://openalex.org/W1522301498","https://openalex.org/W1546892833","https://openalex.org/W1897240248","https://openalex.org/W2013608223","https://openalex.org/W2044893557","https://openalex.org/W2094721231","https://openalex.org/W2144404214","https://openalex.org/W2153894152","https://openalex.org/W2405774341","https://openalex.org/W2516342150","https://openalex.org/W2593116425","https://openalex.org/W2774707525","https://openalex.org/W2937484199","https://openalex.org/W2952218014","https://openalex.org/W2963103134","https://openalex.org/W2963321191","https://openalex.org/W2963341071","https://openalex.org/W2963453742","https://openalex.org/W3004910802","https://openalex.org/W3093839391","https://openalex.org/W3094550259","https://openalex.org/W3127686677","https://openalex.org/W3147539069","https://openalex.org/W3209141406","https://openalex.org/W4289242435"],"related_works":["https://openalex.org/W2130024686","https://openalex.org/W4315606010","https://openalex.org/W2603663739","https://openalex.org/W4319347033","https://openalex.org/W2341426843","https://openalex.org/W2020464095","https://openalex.org/W4282973432","https://openalex.org/W1890431648","https://openalex.org/W1572861854","https://openalex.org/W2131711534"],"abstract_inverted_index":{"Speech":[0],"enhancement":[1,42],"is":[2],"a":[3,40,105,109,126,137,149],"task":[4],"to":[5,23,79,139,143],"improve":[6],"the":[7,65,131,140,160,164],"intelligibility":[8],"and":[9,33,101,107,153,170],"perceptual":[10],"quality":[11],"of":[12,60,70,134,151,156,168],"degraded":[13],"speech":[14,24,35,41,85,100,121,128],"signal.Recently,":[15],"neural":[16,27,76],"networks":[17,77],"based":[18,29],"methods":[19,30],"have":[20],"been":[21],"applied":[22],"enhancement.However,":[25],"many":[26],"network":[28,113],"require":[31],"noisy":[32,127],"clean":[34],"pairs":[36],"for":[37,120],"training.We":[38],"propose":[39],"framework":[43],"that":[44,83],"can":[45],"be":[46],"trained":[47,141],"with":[48,130],"large-scale":[49],"weakly":[50],"labelled":[51,54],"AudioSet":[52],"dataset.Weakly":[53],"data":[55],"only":[56],"contain":[57,84],"audio":[58,61,75,90],"tags":[59],"clips,":[62],"but":[63],"not":[64],"onset":[66],"or":[67,86],"offset":[68],"times":[69],"speech.We":[71],"first":[72],"apply":[73],"pretrained":[74],"(PANNs)":[78],"detect":[80],"anchor":[81,97],"segments":[82,98],"sound":[87,102],"events":[88,103],"in":[89],"clips.Then,":[91],"we":[92,124],"randomly":[93],"mix":[94],"two":[95],"detected":[96],"containing":[99],"as":[104,117,136],"mixture,":[106],"build":[108],"conditional":[110],"source":[111],"separation":[112],"using":[114],"PANNs":[115],"predictions":[116],"soft":[118],"conditions":[119],"enhancement.In":[122],"inference,":[123],"input":[125],"signal":[129],"one-hot":[132],"encoding":[133],"\"Speech\"":[135],"condition":[138],"system":[142,147,167],"predict":[144],"enhanced":[145],"speech.Our":[146],"achieves":[148],"PESQ":[150],"2.28":[152],"an":[154],"SSNR":[155],"8.75":[157],"dB":[158,172],"on":[159],"VoiceBank-DEMAND":[161],"dataset,":[162],"outperforming":[163],"previous":[165],"SEGAN":[166],"2.16":[169],"7.73":[171],"respectively.":[173]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
