{"id":"https://openalex.org/W7127052895","doi":"https://doi.org/10.1145/3784833.3784897","title":"Permutation-Free Training via Loudness Order for Interpretable Speech Separation","display_name":"Permutation-Free Training via Loudness Order for Interpretable Speech Separation","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W7127052895","doi":"https://doi.org/10.1145/3784833.3784897"},"language":null,"primary_location":{"id":"doi:10.1145/3784833.3784897","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3784833.3784897","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 11th International Conference on Communication and Information Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3784833.3784897","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124705954","display_name":"Jiaying Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiaying Wang","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124700333","display_name":"Li Guo","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Guo","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China and Engineering Research Center of Blockchain and Network Convergence Technology, Ministry of Education, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, China and Engineering Research Center of Blockchain and Network Convergence Technology, Ministry of Education, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5124705954"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.73245247,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"156","last_page":"160"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9799000024795532,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9799000024795532,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.01600000075995922,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/loudness","display_name":"Loudness","score":0.7609000205993652},{"id":"https://openalex.org/keywords/permutation","display_name":"Permutation (music)","score":0.6556000113487244},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.5464000105857849},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.48420000076293945},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.37549999356269836},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.3750999867916107},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.359499990940094},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.3531999886035919}],"concepts":[{"id":"https://openalex.org/C79018884","wikidata":"https://www.wikidata.org/wiki/Q622324","display_name":"Loudness","level":2,"score":0.7609000205993652},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6786999702453613},{"id":"https://openalex.org/C21308566","wikidata":"https://www.wikidata.org/wiki/Q7169365","display_name":"Permutation (music)","level":2,"score":0.6556000113487244},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6035000085830688},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.5464000105857849},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.48420000076293945},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40380001068115234},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.37549999356269836},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.3750999867916107},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.359499990940094},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3531999886035919},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.33500000834465027},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C183763347","wikidata":"https://www.wikidata.org/wiki/Q120976","display_name":"Factorial","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C182306322","wikidata":"https://www.wikidata.org/wiki/Q1779371","display_name":"Order (exchange)","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26910001039505005},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2624000012874603},{"id":"https://openalex.org/C84525736","wikidata":"https://www.wikidata.org/wiki/Q831366","display_name":"Decision tree","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C2776848632","wikidata":"https://www.wikidata.org/wiki/Q853463","display_name":"Clipping (morphology)","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3784833.3784897","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3784833.3784897","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 11th International Conference on Communication and Information Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3784833.3784897","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3784833.3784897","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 11th International Conference on Communication and Information Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.5004796385765076,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1780344239","https://openalex.org/W2031647436","https://openalex.org/W2088361146","https://openalex.org/W2150415460","https://openalex.org/W2159202424","https://openalex.org/W2221409856","https://openalex.org/W2734774145","https://openalex.org/W2889503488","https://openalex.org/W2891405874","https://openalex.org/W2952218014","https://openalex.org/W2962866211","https://openalex.org/W3109079702"],"related_works":[],"abstract_inverted_index":{"Permutation":[0],"ambiguity":[1],"remains":[2],"a":[3,42,51,57],"fundamental":[4],"challenge":[5],"in":[6],"multi-speaker":[7],"speech":[8,47],"separation.":[9],"Although":[10],"permutation-invariant":[11],"training":[12,76],"(PIT)":[13],"alleviates":[14],"this":[15],"problem,":[16],"it":[17],"produces":[18],"output":[19],"channels":[20],"with":[21],"an":[22],"arbitrary,":[23],"non-interpretable":[24],"order":[25],"while":[26,98],"incurring":[27],"high":[28],"computational":[29],"cost.":[30],"This":[31],"paper":[32],"proposes":[33],"Permutation-Free":[34],"Training":[35],"(PFT)":[36],"via":[37],"Loudness":[38],"Order,":[39],"which":[40],"leverages":[41],"perceptual":[43],"prior":[44],"derived":[45],"from":[46,78],"loudness":[48],"to":[49,80,96],"establish":[50],"fixed":[52],"output\u2013reference":[53],"alignment.":[54],"By":[55],"enforcing":[56],"loudness-ordered":[58],"mapping":[59],"during":[60],"training,":[61],"PFT":[62,91],"eliminates":[63],"the":[64,84],"need":[65],"for":[66],"permutation":[67],"search,":[68],"achieves":[69],"perceptually":[70],"interpretable":[71],"channel":[72],"order,":[73],"and":[74],"reduces":[75],"complexity":[77],"factorial":[79],"linear.":[81],"Experiments":[82],"on":[83],"proposed":[85],"Loud-LibriMix":[86],"dataset":[87],"demonstrate":[88],"that":[89],"our":[90],"attains":[92],"separation":[93],"performance":[94],"comparable":[95],"PIT":[97],"substantially":[99],"improving":[100],"interpretability.":[101]},"counts_by_year":[],"updated_date":"2026-02-06T02:01:19.302388","created_date":"2026-02-03T00:00:00"}
