{"id":"https://openalex.org/W4416250427","doi":"https://doi.org/10.1109/waspaa66052.2025.11231018","title":"Is MixIT Really Unsuitable for Correlated Sources? Exploring MixIT for Unsupervised Pre-training in Music Source Separation","display_name":"Is MixIT Really Unsuitable for Correlated Sources? Exploring MixIT for Unsupervised Pre-training in Music Source Separation","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416250427","doi":"https://doi.org/10.1109/waspaa66052.2025.11231018"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11231018","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11231018","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013385059","display_name":"Kohei Saijo","orcid":null},"institutions":[{"id":"https://openalex.org/I73613424","display_name":"National Institute of Advanced Industrial Science and Technology","ror":"https://ror.org/01703db54","country_code":"JP","type":"government","lineage":["https://openalex.org/I73613424"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Kohei Saijo","raw_affiliation_strings":["National Institute of Advanced Industrial Science and Technology (AIST),Tokyo,Japan"],"affiliations":[{"raw_affiliation_string":"National Institute of Advanced Industrial Science and Technology (AIST),Tokyo,Japan","institution_ids":["https://openalex.org/I73613424"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5007353694","display_name":"Yoshiaki Bando","orcid":"https://orcid.org/0000-0002-3934-0745"},"institutions":[{"id":"https://openalex.org/I73613424","display_name":"National Institute of Advanced Industrial Science and Technology","ror":"https://ror.org/01703db54","country_code":"JP","type":"government","lineage":["https://openalex.org/I73613424"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yoshiaki Bando","raw_affiliation_strings":["National Institute of Advanced Industrial Science and Technology (AIST),Tokyo,Japan"],"affiliations":[{"raw_affiliation_string":"National Institute of Advanced Industrial Science and Technology (AIST),Tokyo,Japan","institution_ids":["https://openalex.org/I73613424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5013385059"],"corresponding_institution_ids":["https://openalex.org/I73613424"],"apc_list":null,"apc_paid":null,"fwci":1.6611,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88332546,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9372000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9372000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.04019999876618385,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.006000000052154064,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/source-separation","display_name":"Source separation","score":0.5916000008583069},{"id":"https://openalex.org/keywords/unsupervised-learning","display_name":"Unsupervised learning","score":0.5339000225067139},{"id":"https://openalex.org/keywords/data-source","display_name":"Data source","score":0.4032000005245209},{"id":"https://openalex.org/keywords/information-source","display_name":"Information source (mathematics)","score":0.3549000024795532},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.334199994802475},{"id":"https://openalex.org/keywords/blind-signal-separation","display_name":"Blind signal separation","score":0.32850000262260437}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6075000166893005},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.5916000008583069},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5745999813079834},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.5339000225067139},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42899999022483826},{"id":"https://openalex.org/C2983685735","wikidata":"https://www.wikidata.org/wiki/Q5227355","display_name":"Data source","level":2,"score":0.4032000005245209},{"id":"https://openalex.org/C2778095710","wikidata":"https://www.wikidata.org/wiki/Q6031225","display_name":"Information source (mathematics)","level":2,"score":0.3549000024795532},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.334199994802475},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3314000070095062},{"id":"https://openalex.org/C120317606","wikidata":"https://www.wikidata.org/wiki/Q17105967","display_name":"Blind signal separation","level":3,"score":0.32850000262260437},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C2985998994","wikidata":"https://www.wikidata.org/wiki/Q3644502","display_name":"Source model","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11231018","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11231018","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1976069042","https://openalex.org/W2022668263","https://openalex.org/W2963992487","https://openalex.org/W2972411915","https://openalex.org/W3037149862","https://openalex.org/W3160733670","https://openalex.org/W3161934504","https://openalex.org/W3168140565","https://openalex.org/W3197103473","https://openalex.org/W3199957557","https://openalex.org/W3209059054","https://openalex.org/W4206634335","https://openalex.org/W4221160586","https://openalex.org/W4225326921","https://openalex.org/W4292969786","https://openalex.org/W4323064860","https://openalex.org/W4372271362","https://openalex.org/W4375928773","https://openalex.org/W4385822289","https://openalex.org/W4392903379","https://openalex.org/W4392931574","https://openalex.org/W4403126475","https://openalex.org/W4408347314","https://openalex.org/W4415433380"],"related_works":[],"abstract_inverted_index":{"In":[0],"music":[1],"source":[2,49,100],"separation":[3],"(MSS),":[4],"obtaining":[5],"isolated":[6],"sources":[7],"or":[8,83],"stems":[9],"is":[10],"highly":[11],"costly,":[12],"making":[13],"pre-training":[14,134,175],"on":[15,142,157],"unlabeled":[16,144],"data":[17,145],"a":[18,140],"promising":[19],"approach.":[20],"Although":[21],"source-agnostic":[22],"unsupervised":[23,124],"learning":[24],"like":[25],"mixture-invariant":[26],"training":[27,180],"(MixIT)":[28],"has":[29],"been":[30,38],"explored":[31],"in":[32,41],"general":[33],"sound":[34],"separation,":[35],"they":[36],"have":[37],"largely":[39],"overlooked":[40],"MSS":[42,61,68,169],"due":[43],"to":[44,60,117],"its":[45,121],"implicit":[46],"assumption":[47],"of":[48,57,67,80,166],"independence.":[50],"We":[51,137],"hypothesize,":[52],"however,":[53],"that":[54,111,173],"the":[55,64,147,162,167,177],"difficulty":[56],"applying":[58],"MixIT":[59,95],"arises":[62],"from":[63,90,146,181],"ill-posed":[65],"nature":[66],"itself,":[69],"where":[70],"stem":[71],"definitions":[72],"are":[73],"application-dependent":[74],"and":[75,102,153],"models":[76],"lack":[77],"explicit":[78],"knowledge":[79],"what":[81],"should":[82,84],"not":[85,97],"be":[86],"separated,":[87],"rather":[88],"than":[89],"high":[91],"inter-source":[92],"correlation.":[93],"While":[94],"does":[96],"assume":[98],"any":[99],"model":[101,141],"struggles":[103],"with":[104,159],"such":[105],"ambiguities,":[106],"our":[107],"preliminary":[108],"experiments":[109],"show":[110],"it":[112,156],"can":[113],"still":[114],"separate":[115],"instruments":[116],"some":[118],"extent,":[119],"suggesting":[120],"potential":[122],"for":[123,135],"pre-training.":[125],"Motivated":[126],"by":[127],"these":[128],"insights,":[129],"this":[130],"study":[131],"investigates":[132],"MixIT-based":[133,174],"MSS.":[136],"first":[138],"pre-train":[139],"in-the-wild,":[143],"Free":[148],"Music":[149],"Archive":[150],"using":[151],"MixIT,":[152],"then":[154],"fine-tune":[155],"MUSDB18":[158],"supervision.":[160],"Using":[161],"band-split":[163],"TF-Locoformer,":[164],"one":[165],"state-of-the-art":[168],"models,":[170],"we":[171],"demonstrate":[172],"improves":[176],"performance":[178],"over":[179],"scratch.":[182]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
