{"id":"https://openalex.org/W4414946190","doi":"https://doi.org/10.1109/waspaa66052.2025.11230934","title":"Towards Reliable Objective Evaluation Metrics for Generative Singing Voice Separation Models","display_name":"Towards Reliable Objective Evaluation Metrics for Generative Singing Voice Separation Models","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4414946190","doi":"https://doi.org/10.1109/waspaa66052.2025.11230934"},"language":"en","primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230934","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230934","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2507.11427","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002871490","display_name":"Paul A. Bereuter","orcid":null},"institutions":[{"id":"https://openalex.org/I317320687","display_name":"University of Music and Performing Arts Graz","ror":"https://ror.org/0541v4g57","country_code":"AT","type":"education","lineage":["https://openalex.org/I317320687"]}],"countries":["AT"],"is_corresponding":true,"raw_author_name":"Paul A. Bereuter","raw_affiliation_strings":["University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria"],"affiliations":[{"raw_affiliation_string":"University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria","institution_ids":["https://openalex.org/I317320687"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065188041","display_name":"Benjamin E. Stahl","orcid":"https://orcid.org/0000-0002-3169-3167"},"institutions":[{"id":"https://openalex.org/I317320687","display_name":"University of Music and Performing Arts Graz","ror":"https://ror.org/0541v4g57","country_code":"AT","type":"education","lineage":["https://openalex.org/I317320687"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Benjamin Stahl","raw_affiliation_strings":["University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria"],"affiliations":[{"raw_affiliation_string":"University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria","institution_ids":["https://openalex.org/I317320687"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066967599","display_name":"Mark D. Plumbley","orcid":"https://orcid.org/0000-0002-9708-1075"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mark D. Plumbley","raw_affiliation_strings":["University of Surrey,Centre for Vision, Speech and Signal Processing (CVSSP),UK"],"affiliations":[{"raw_affiliation_string":"University of Surrey,Centre for Vision, Speech and Signal Processing (CVSSP),UK","institution_ids":["https://openalex.org/I28290843"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5087260049","display_name":"Alois Sontacchi","orcid":"https://orcid.org/0009-0008-9205-209X"},"institutions":[{"id":"https://openalex.org/I317320687","display_name":"University of Music and Performing Arts Graz","ror":"https://ror.org/0541v4g57","country_code":"AT","type":"education","lineage":["https://openalex.org/I317320687"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Alois Sontacchi","raw_affiliation_strings":["University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria"],"affiliations":[{"raw_affiliation_string":"University of Music and Performing Arts,Institute of Electronic Music and Acoustics (IEM),Graz,Austria","institution_ids":["https://openalex.org/I317320687"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5002871490"],"corresponding_institution_ids":["https://openalex.org/I317320687"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14225741,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.8436999917030334},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.6366000175476074},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.48820000886917114},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.459199994802475},{"id":"https://openalex.org/keywords/correlation","display_name":"Correlation","score":0.4472000002861023},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.436599999666214},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4359000027179718},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.43320000171661377},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.42879998683929443},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.41929998993873596}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.8436999917030334},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.6366000175476074},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6115999817848206},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5669000148773193},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5339999794960022},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.48820000886917114},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.46230000257492065},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.459199994802475},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.4472000002861023},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.436599999666214},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4359000027179718},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.43320000171661377},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.42879998683929443},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.41929998993873596},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.4154999852180481},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4065000116825104},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.364300012588501},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.36230000853538513},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.3612000048160553},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.34610000252723694},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C153874254","wikidata":"https://www.wikidata.org/wiki/Q115542","display_name":"Canonical correlation","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.3012000024318695},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.29600000381469727},{"id":"https://openalex.org/C163175372","wikidata":"https://www.wikidata.org/wiki/Q3339222","display_name":"Linear model","level":2,"score":0.29100000858306885},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2888999879360199},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.27869999408721924},{"id":"https://openalex.org/C90697598","wikidata":"https://www.wikidata.org/wiki/Q3657183","display_name":"Objective test","level":3,"score":0.2727000117301941},{"id":"https://openalex.org/C120317606","wikidata":"https://www.wikidata.org/wiki/Q17105967","display_name":"Blind signal separation","level":3,"score":0.2676999866962433},{"id":"https://openalex.org/C158622935","wikidata":"https://www.wikidata.org/wiki/Q660848","display_name":"Nonlinear system","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.2574000060558319},{"id":"https://openalex.org/C69738355","wikidata":"https://www.wikidata.org/wiki/Q1228929","display_name":"Linear discriminant analysis","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230934","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230934","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2507.11427","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.11427","pdf_url":"https://arxiv.org/pdf/2507.11427","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2507.11427","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.11427","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2507.11427","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2507.11427","pdf_url":"https://arxiv.org/pdf/2507.11427","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W165956390","https://openalex.org/W2963992487","https://openalex.org/W4394916053","https://openalex.org/W2127851351","https://openalex.org/W4380434618","https://openalex.org/W2930648092","https://openalex.org/W4408352240","https://openalex.org/W4323064860","https://openalex.org/W2963204961","https://openalex.org/W3015338123","https://openalex.org/W4372260310","https://openalex.org/W2972478942","https://openalex.org/W4392902957","https://openalex.org/W4386764924","https://openalex.org/W4408353835","https://openalex.org/W4402112400","https://openalex.org/W4372260250","https://openalex.org/W3199957557","https://openalex.org/W4392903379","https://openalex.org/W4402112435","https://openalex.org/W3025844872","https://openalex.org/W2791686384","https://openalex.org/W4210984894","https://openalex.org/W3207860153","https://openalex.org/W3037038648","https://openalex.org/W2078483536"],"related_works":[],"abstract_inverted_index":{"Traditional":[0],"Blind":[1],"Source":[2],"Separation":[3],"Evaluation":[4],"(BSS-Eval)":[5],"metrics":[6,43,77,105,114,180,200],"were":[7],"originally":[8],"designed":[9],"to":[10,137],"evaluate":[11,86],"linear":[12],"audio":[13,75],"source":[14],"separation":[15,186],"models":[16,27,90,187],"based":[17],"on":[18,131,157],"methods":[19],"such":[20,115],"as":[21,116],"time-frequency":[22],"masking.":[23],"However,":[24],"recent":[25],"generative":[26,95,101,141,183],"may":[28],"introduce":[29],"nonlinear":[30],"relationships":[31],"between":[32,62],"the":[33,39,63,79,122,128,138,143,149,154,161,165,176,190,202],"separated":[34],"and":[35,59,70,91,100,153,188,195],"reference":[36],"signals,":[37],"limiting":[38],"reliability":[40],"of":[41,73,81,140,178,197,204],"these":[42],"for":[44,78,148,181,192,201],"objective":[45,74],"evaluation.":[46],"To":[47],"address":[48],"this":[49],"issue,":[50],"we":[51],"conduct":[52],"a":[53,71],"Degradation":[54],"Category":[55],"Rating":[56],"listening":[57],"test":[58],"analyze":[60],"correlations":[61,108,145],"obtained":[64],"degradation":[65],"mean":[66],"opinion":[67],"scores":[68],"(DMOS)":[69],"set":[72],"quality":[76],"task":[80,203],"singing":[82,184,205],"voice":[83,185,206],"separation.":[84,207],"We":[85],"three":[87],"state-of-the-art":[88],"discriminative":[89,99,120],"two":[92],"new,":[93],"competitive":[94],"models.":[96],"For":[97,119],"both":[98,170],"models,":[102,121,142],"intrusive":[103,113],"embedding-based":[104],"show":[106],"higher":[107],"with":[109,160],"DMOS":[110],"than":[111],"conventional":[112],"BSS-Eval":[117,179],"metrics.":[118],"highest":[123],"correlation":[124,168],"is":[125],"achieved":[126],"by":[127],"MSE":[129,155],"computed":[130],"Music2Latent":[132],"embeddings.":[133],"When":[134],"it":[135],"comes":[136],"evaluation":[139,199],"strongest":[144],"are":[146],"evident":[147],"multi-resolution":[150],"STFT":[151],"loss":[152],"calculated":[156],"MERT-L12":[158],"embeddings,":[159],"latter":[162],"also":[163],"providing":[164],"most":[166],"balanced":[167],"across":[169],"model":[171],"types.":[172],"Our":[173],"results":[174],"highlight":[175],"limitations":[177],"evaluating":[182],"emphasize":[189],"need":[191],"careful":[193],"selection":[194],"validation":[196],"alternative":[198]},"counts_by_year":[],"updated_date":"2026-04-15T08:11:43.952461","created_date":"2025-10-08T00:00:00"}
