{"id":"https://openalex.org/W4372260037","doi":"https://doi.org/10.1109/icassp49357.2023.10095761","title":"Unsupervised Vocal Dereverberation with Diffusion-Based Generative Models","display_name":"Unsupervised Vocal Dereverberation with Diffusion-Based Generative Models","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372260037","doi":"https://doi.org/10.1109/icassp49357.2023.10095761"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095761","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095761","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071428569","display_name":"Koichi Saito","orcid":"https://orcid.org/0000-0001-6497-5286"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Koichi Saito","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031225419","display_name":"Naoki Murata","orcid":"https://orcid.org/0000-0001-7418-5173"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Naoki Murata","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034556374","display_name":"Toshimitsu Uesaka","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Toshimitsu Uesaka","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044626752","display_name":"Chieh-Hsin Lai","orcid":"https://orcid.org/0009-0009-3059-929X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chieh-Hsin Lai","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078773088","display_name":"Yuhta Takida","orcid":"https://orcid.org/0000-0001-7384-0842"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuhta Takida","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101696966","display_name":"Takao Fukui","orcid":"https://orcid.org/0000-0001-5990-642X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takao Fukui","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088754502","display_name":"Yuki Mitsufuji","orcid":"https://orcid.org/0000-0002-6806-6140"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":["Sony Group Corporation,Tokyo,Japan","Sony Group Corporation, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Tokyo,Japan","institution_ids":[]},{"raw_affiliation_string":"Sony Group Corporation, Tokyo, Japan","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5071428569"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.0377,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.87330203,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10822","display_name":"Acoustic Wave Phenomena Research","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reverberation","display_name":"Reverberation","score":0.93266361951828},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7613599300384521},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6023951172828674},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4708014726638794},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4367208182811737},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.42502737045288086},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.35191965103149414},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.3312046527862549}],"concepts":[{"id":"https://openalex.org/C95851461","wikidata":"https://www.wikidata.org/wiki/Q468809","display_name":"Reverberation","level":2,"score":0.93266361951828},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7613599300384521},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6023951172828674},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4708014726638794},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4367208182811737},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.42502737045288086},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.35191965103149414},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.3312046527862549},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095761","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095761","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1572891627","https://openalex.org/W1901129140","https://openalex.org/W1975710323","https://openalex.org/W2002507711","https://openalex.org/W2005216556","https://openalex.org/W2067709094","https://openalex.org/W2076456512","https://openalex.org/W2100522270","https://openalex.org/W2161866312","https://openalex.org/W2164502538","https://openalex.org/W2167316218","https://openalex.org/W2296153915","https://openalex.org/W2507109542","https://openalex.org/W2526050071","https://openalex.org/W2787898208","https://openalex.org/W2791686384","https://openalex.org/W2964013315","https://openalex.org/W2972478942","https://openalex.org/W3001643781","https://openalex.org/W3163735648","https://openalex.org/W3181854487","https://openalex.org/W4224932134","https://openalex.org/W4281872541","https://openalex.org/W4312293341","https://openalex.org/W6634311145","https://openalex.org/W6779823529","https://openalex.org/W6784335252","https://openalex.org/W6786375611","https://openalex.org/W6788570256","https://openalex.org/W6796086097","https://openalex.org/W6803646683","https://openalex.org/W6809940947","https://openalex.org/W6839052650"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W4395044357","https://openalex.org/W4287117424","https://openalex.org/W4387506531","https://openalex.org/W2087346071","https://openalex.org/W2967848559","https://openalex.org/W4299831724"],"abstract_inverted_index":{"Removing":[0],"reverb":[1,29,36,97],"from":[2],"reverberant":[3,63],"music":[4,15,19,99],"is":[5,110],"a":[6,31,92,123],"necessary":[7],"technique":[8,127],"to":[9,38,72,75],"clean":[10],"up":[11],"audio":[12],"for":[13,68,98,105],"downstream":[14],"manipulations.":[16],"Reverberation":[17],"of":[18,62,95,103,136],"contains":[20],"two":[21],"categories,":[22],"natural":[23,35],"reverb,":[24],"and":[25,43,59,65,128,143],"artificial":[26,96],"reverb.":[27],"Artificial":[28],"has":[30],"wider":[32],"diversity":[33],"than":[34],"due":[37],"its":[39],"various":[40],"parameter":[41],"setups":[42],"reverberation":[44,120],"types.":[45],"However,":[46],"recent":[47],"supervised":[48],"dereverberation":[49,154],"methods":[50],"may":[51],"fail":[52],"because":[53],"they":[54],"rely":[55],"on":[56,112],"sufficiently":[57],"diverse":[58],"numerous":[60],"pairs":[61,102],"observations":[64,77],"retrieved":[66],"data":[67,104],"training":[69],"in":[70],"order":[71],"be":[73],"generalizable":[74],"unseen":[76],"during":[78],"inference.":[79],"To":[80],"resolve":[81],"these":[82],"problems,":[83],"we":[84],"propose":[85],"an":[86],"unsupervised":[87],"method":[88,109,148],"that":[89,146],"can":[90],"remove":[91],"general":[93],"kind":[94],"without":[100],"requiring":[101],"training.":[106],"The":[107],"proposed":[108],"based":[111],"diffusion":[113,137],"models,":[114],"where":[115],"it":[116],"initializes":[117],"the":[118,131,134,150],"unknown":[119],"operator":[121],"with":[122,133],"conventional":[124],"signal":[125],"processing":[126],"simultaneously":[129],"refines":[130],"estimate":[132],"help":[135],"models.":[138],"We":[139],"show":[140],"through":[141],"objective":[142],"perceptual":[144],"evaluations":[145],"our":[147],"outperforms":[149],"current":[151],"leading":[152],"vocal":[153],"benchmarks.":[155]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
