{"id":"https://openalex.org/W4416251836","doi":"https://doi.org/10.1109/waspaa66052.2025.11230940","title":"Generating Separated Singing Vocals Using a Diffusion Model Conditioned on Music Mixtures","display_name":"Generating Separated Singing Vocals Using a Diffusion Model Conditioned on Music Mixtures","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416251836","doi":"https://doi.org/10.1109/waspaa66052.2025.11230940"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230940","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230940","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027345230","display_name":"Gen\u00eds Plaja-Roglans","orcid":"https://orcid.org/0000-0003-3450-3194"},"institutions":[{"id":"https://openalex.org/I4210089335","display_name":"Perfect Harmony Health","ror":"https://ror.org/00835yh61","country_code":"US","type":"other","lineage":["https://openalex.org/I4210089335"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Gen\u00eds Plaja-Roglans","raw_affiliation_strings":["Music.AI,Salt Lake City,USA"],"affiliations":[{"raw_affiliation_string":"Music.AI,Salt Lake City,USA","institution_ids":["https://openalex.org/I4210089335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005504900","display_name":"Yun-Ning Hung","orcid":"https://orcid.org/0000-0002-7242-6903"},"institutions":[{"id":"https://openalex.org/I4210089335","display_name":"Perfect Harmony Health","ror":"https://ror.org/00835yh61","country_code":"US","type":"other","lineage":["https://openalex.org/I4210089335"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yun-Ning Hung","raw_affiliation_strings":["Music.AI,Salt Lake City,USA"],"affiliations":[{"raw_affiliation_string":"Music.AI,Salt Lake City,USA","institution_ids":["https://openalex.org/I4210089335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006479715","display_name":"Xavier Serra","orcid":"https://orcid.org/0000-0003-1395-2345"},"institutions":[{"id":"https://openalex.org/I170486558","display_name":"Pompeu Fabra University","ror":"https://ror.org/04n0g0b29","country_code":"ES","type":"education","lineage":["https://openalex.org/I170486558"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Xavier Serra","raw_affiliation_strings":["Universitat Pompeu Fabra,Music Technology Group,Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Pompeu Fabra,Music Technology Group,Spain","institution_ids":["https://openalex.org/I170486558"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018881892","display_name":"Igor Muzetti Pereira","orcid":"https://orcid.org/0000-0002-2715-9631"},"institutions":[{"id":"https://openalex.org/I4210089335","display_name":"Perfect Harmony Health","ror":"https://ror.org/00835yh61","country_code":"US","type":"other","lineage":["https://openalex.org/I4210089335"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Igor Pereira","raw_affiliation_strings":["Music.AI,Salt Lake City,USA"],"affiliations":[{"raw_affiliation_string":"Music.AI,Salt Lake City,USA","institution_ids":["https://openalex.org/I4210089335"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5027345230"],"corresponding_institution_ids":["https://openalex.org/I4210089335"],"apc_list":null,"apc_paid":null,"fwci":1.1366,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.84200251,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.28700000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.28700000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.2053000032901764,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.19939999282360077,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.6923999786376953},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.6757000088691711},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.6402000188827515},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.593999981880188},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5658000111579895},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5113999843597412},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.46209999918937683},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4332999885082245},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.43130001425743103}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.6923999786376953},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.6757000088691711},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.6402000188827515},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.593999981880188},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5658000111579895},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5339000225067139},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5281000137329102},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5113999843597412},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.46209999918937683},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4366999864578247},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4332999885082245},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.43130001425743103},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.42399999499320984},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4203000068664551},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.3864000141620636},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3666999936103821},{"id":"https://openalex.org/C68710425","wikidata":"https://www.wikidata.org/wiki/Q5275442","display_name":"Diffusion process","level":3,"score":0.36570000648498535},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3531000018119812},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3513999879360199},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C2781100714","wikidata":"https://www.wikidata.org/wiki/Q377435","display_name":"Vibrato","level":3,"score":0.33149999380111694},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.3260999917984009},{"id":"https://openalex.org/C102519508","wikidata":"https://www.wikidata.org/wiki/Q6520159","display_name":"Fourier transform","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.29809999465942383},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.2962000072002411},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2890999913215637},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C85841341","wikidata":"https://www.wikidata.org/wiki/Q1135984","display_name":"Octave (electronics)","level":2,"score":0.2808000147342682}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230940","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230940","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2760103357","https://openalex.org/W2963992487","https://openalex.org/W3015268063","https://openalex.org/W3037149862","https://openalex.org/W3095999710","https://openalex.org/W4224297089","https://openalex.org/W4297841530","https://openalex.org/W4372260250","https://openalex.org/W4372341905","https://openalex.org/W4382203562","https://openalex.org/W4388979610","https://openalex.org/W4390873054","https://openalex.org/W4392903379","https://openalex.org/W4394916053","https://openalex.org/W4408352240","https://openalex.org/W4408355408"],"related_works":[],"abstract_inverted_index":{"Separating":[0],"the":[1,30,38,41,84,89,119,123,129,139,143,146],"individual":[2],"elements":[3],"in":[4],"a":[5,34,54,76],"musical":[6],"mixture":[7,35],"is":[8,19,80],"an":[9,135],"essential":[10],"process":[11],"for":[12,59],"music":[13,73],"analysis":[14],"and":[15,43,99,126,150],"practice.":[16],"While":[17],"this":[18,60,64],"generally":[20],"addressed":[21],"using":[22,75],"neural":[23],"networks":[24],"optimized":[25],"to":[26,36,53,82,121],"mask":[27],"or":[28],"transform":[29],"time-frequency":[31],"representation":[32],"of":[33,46,57,115,138,145],"extract":[37],"target":[39],"sources,":[40],"flexibility":[42],"generalization":[44],"capabilities":[45],"generative":[47,97],"diffusion":[48,77,116],"models":[49],"are":[50,152],"giving":[51],"rise":[52],"novel":[55],"class":[56],"solutions":[58],"complicated":[61],"task.":[62],"In":[63],"work,":[65],"we":[66],"explore":[67],"singing":[68],"voice":[69],"separation":[70],"from":[71],"real":[72],"recordings":[74],"model":[78],"which":[79],"trained":[81,108],"generate":[83],"solo":[85],"vocals":[86],"conditioned":[87],"on":[88],"corresponding":[90],"mixture.":[91],"Our":[92],"approach":[93],"improves":[94],"upon":[95],"prior":[96],"systems":[98],"achieves":[100],"competitive":[101],"objective":[102],"scores":[103],"against":[104],"non-generative":[105],"baselines":[106],"when":[107,131],"with":[109],"supplementary":[110],"data.":[111],"The":[112],"iterative":[113],"nature":[114],"sampling":[117,140],"enables":[118],"user":[120],"control":[122],"quality-efficiency":[124],"trade-off,":[125],"also":[127],"refine":[128],"output":[130],"needed.":[132],"We":[133],"present":[134],"ablation":[136],"study":[137],"algorithm,":[141],"highlighting":[142],"effects":[144],"user-configurable":[147],"parameters.":[148],"Code":[149],"weights":[151],"released.":[153]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-11-14T00:00:00"}
