{"id":"https://openalex.org/W4416250740","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228078","title":"Efficient and Fast Generative-Based Singing Voice Separation using a Latent Diffusion Model","display_name":"Efficient and Fast Generative-Based Singing Voice Separation using a Latent Diffusion Model","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4416250740","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228078"},"language":null,"primary_location":{"id":"doi:10.1109/ijcnn64981.2025.11228078","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228078","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.20470","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027345230","display_name":"Gen\u00eds Plaja-Roglans","orcid":"https://orcid.org/0000-0003-3450-3194"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gen\u00eds Plaja-Roglans","raw_affiliation_strings":["Music.AI,Salt Lake City,Utah,United States"],"affiliations":[{"raw_affiliation_string":"Music.AI,Salt Lake City,Utah,United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005504900","display_name":"Yun-Ning Hung","orcid":"https://orcid.org/0000-0002-7242-6903"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun-Ning Hung","raw_affiliation_strings":["Music.AI,Salt Lake City,Utah,United States"],"affiliations":[{"raw_affiliation_string":"Music.AI,Salt Lake City,Utah,United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006479715","display_name":"Xavier Serra","orcid":"https://orcid.org/0000-0003-1395-2345"},"institutions":[{"id":"https://openalex.org/I170486558","display_name":"Pompeu Fabra University","ror":"https://ror.org/04n0g0b29","country_code":"ES","type":"education","lineage":["https://openalex.org/I170486558"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Xavier Serra","raw_affiliation_strings":["Universitat Pompeu Fabra,Music Technology Group,Barcelona,Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Pompeu Fabra,Music Technology Group,Barcelona,Spain","institution_ids":["https://openalex.org/I170486558"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018881892","display_name":"Igor Muzetti Pereira","orcid":"https://orcid.org/0000-0002-2715-9631"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Igor Pereira","raw_affiliation_strings":["Music.AI,Salt Lake City,Utah,United States"],"affiliations":[{"raw_affiliation_string":"Music.AI,Salt Lake City,Utah,United States","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5027345230"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45247468,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.35920000076293945,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.35920000076293945,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.15839999914169312,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.11959999799728394,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.6794999837875366},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.6166999936103821},{"id":"https://openalex.org/keywords/source-separation","display_name":"Source separation","score":0.6137999892234802},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.5566999912261963},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.544700026512146},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4722999930381775},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.42100000381469727},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.38350000977516174},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.37720000743865967}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.6794999837875366},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6740000247955322},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.6166999936103821},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.6137999892234802},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.5566999912261963},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.544700026512146},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.51910001039505},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5012000203132629},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4722999930381775},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.42100000381469727},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.38350000977516174},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.37720000743865967},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.3628000020980835},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.35420000553131104},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3384999930858612},{"id":"https://openalex.org/C2778858076","wikidata":"https://www.wikidata.org/wiki/Q5249539","display_name":"Decodes","level":3,"score":0.3328000009059906},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.31790000200271606},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C120317606","wikidata":"https://www.wikidata.org/wiki/Q17105967","display_name":"Blind signal separation","level":3,"score":0.2727000117301941},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2720000147819519},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.25600001215934753},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.25429999828338623}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/ijcnn64981.2025.11228078","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228078","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2511.20470","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.20470","pdf_url":"https://arxiv.org/pdf/2511.20470","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.20470","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.20470","pdf_url":"https://arxiv.org/pdf/2511.20470","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2127851351","https://openalex.org/W2131738223","https://openalex.org/W2565807767","https://openalex.org/W2791686384","https://openalex.org/W2963992487","https://openalex.org/W2998490864","https://openalex.org/W3015268063","https://openalex.org/W3095999710","https://openalex.org/W3197334236","https://openalex.org/W3215615641","https://openalex.org/W4224297089","https://openalex.org/W4225261970","https://openalex.org/W4312933868","https://openalex.org/W4372260250","https://openalex.org/W4372341629","https://openalex.org/W4372341905","https://openalex.org/W4375928773","https://openalex.org/W4390873054","https://openalex.org/W4392903379","https://openalex.org/W4394916053","https://openalex.org/W4402905495","https://openalex.org/W4408352240","https://openalex.org/W4408355053","https://openalex.org/W4408355408"],"related_works":[],"abstract_inverted_index":{"Extracting":[0],"individual":[1,27],"elements":[2],"from":[3],"music":[4,11,40],"mixtures":[5,111],"is":[6,53,148],"a":[7,67,129,167,179,197],"valuable":[8],"tool":[9],"for":[10,112,192,200],"production":[12],"and":[13,37,75,110,133,143,160,173],"practice.":[14],"While":[15],"neural":[16],"networks":[17],"optimized":[18],"to":[19,55,62,90],"mask":[20],"or":[21],"transform":[22],"mixture":[23,52],"spectrograms":[24],"into":[25,137],"the":[26,31,34,51,72,85,123,162,184,193,204],"source(s)":[28],"have":[29],"been":[30],"leading":[32],"approach,":[33],"source":[35],"overlap":[36],"correlation":[38],"in":[39,50,66,128],"signals":[41],"poses":[42],"an":[43],"inherent":[44],"challenge.":[45],"Also,":[46],"accessing":[47],"all":[48],"sources":[49],"crucial":[54],"train":[56],"these":[57,64,136],"systems,":[58,159],"while":[59],"complicated.":[60],"Attempts":[61],"address":[63],"challenges":[65],"generative":[68,98,157],"fashion":[69],"exist,":[70],"however,":[71],"separation":[73,101,158],"performance":[74],"inference":[76],"efficiency":[77],"remain":[78],"limited.":[79],"In":[80],"this":[81,94],"work,":[82],"we":[83,119],"study":[84,182],"potential":[86,191],"of":[87,107,169],"diffusion":[88],"models":[89],"advance":[91],"toward":[92],"bridging":[93],"gap,":[95],"focusing":[96],"on":[97,104,166,174,183,189,203],"singing":[99],"voice":[100],"relying":[102],"only":[103,151],"corresponding":[105],"pairs":[106],"isolated":[108],"vocals":[109],"training.":[113],"To":[114],"align":[115],"with":[116],"creative":[117],"workflows,":[118],"leverage":[120],"latent":[121,131,185],"diffusion:":[122],"system":[124,147],"generates":[125],"samples":[126],"encoded":[127],"compact":[130],"space,":[132],"subsequently":[134],"decodes":[135],"audio.":[138],"This":[139],"enables":[140],"efficient":[141],"optimization":[142],"faster":[144],"inference.":[145],"Our":[146],"trained":[149],"using":[150],"open":[152],"data.":[153],"We":[154,177,195],"outperform":[155],"existing":[156],"level":[161],"compared":[163],"non-generative":[164],"systems":[165],"list":[168],"signal":[170],"quality":[171],"measures":[172],"interference":[175],"removal.":[176],"provide":[178],"noise":[180],"robustness":[181],"encoder,":[186],"providing":[187],"insights":[188],"its":[190],"task.":[194],"release":[196],"modular":[198],"toolkit":[199],"further":[201],"research":[202],"topic.<sup":[205],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[206],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[207]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
