{"id":"https://openalex.org/W4416251799","doi":"https://doi.org/10.1109/waspaa66052.2025.11231013","title":"Hybrid-Sep: Language-queried audio source separation via pre-trained Model Fusion and Adversarial Consistent Training","display_name":"Hybrid-Sep: Language-queried audio source separation via pre-trained Model Fusion and Adversarial Consistent Training","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416251799","doi":"https://doi.org/10.1109/waspaa66052.2025.11231013"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11231013","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11231013","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101092883","display_name":"Jianyuan Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jianyuan Feng","raw_affiliation_strings":["ByteDance,China"],"affiliations":[{"raw_affiliation_string":"ByteDance,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020706744","display_name":"Guo Li","orcid":"https://orcid.org/0000-0002-7127-1102"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guangzheng Li","raw_affiliation_strings":["ByteDance,China"],"affiliations":[{"raw_affiliation_string":"ByteDance,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064110432","display_name":"Yangfei Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yangfei Xu","raw_affiliation_strings":["ByteDance,China"],"affiliations":[{"raw_affiliation_string":"ByteDance,China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101092883"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.61,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88079332,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8183000087738037,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8183000087738037,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.10419999808073044,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.039500001817941666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8027999997138977},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.5976999998092651},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.534500002861023},{"id":"https://openalex.org/keywords/source-separation","display_name":"Source separation","score":0.5242999792098999},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4821000099182129},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.4023999869823456},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.36730000376701355},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3246000111103058}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8027999997138977},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7724999785423279},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.5976999998092651},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5630999803543091},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.534500002861023},{"id":"https://openalex.org/C2776864781","wikidata":"https://www.wikidata.org/wiki/Q52617913","display_name":"Source separation","level":2,"score":0.5242999792098999},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4821000099182129},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4636000096797943},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4162999987602234},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.4023999869823456},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39399999380111694},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.36730000376701355},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3246000111103058},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.3246000111103058},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.2734000086784363},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.27059999108314514},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26109999418258667},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2540000081062317},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2522999942302704}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11231013","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11231013","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1672116358","https://openalex.org/W2593116425","https://openalex.org/W2998490864","https://openalex.org/W3209059054","https://openalex.org/W4205689591","https://openalex.org/W4372260310","https://openalex.org/W4392908343","https://openalex.org/W4400033239","https://openalex.org/W4403780823","https://openalex.org/W4404037229","https://openalex.org/W4405934446","https://openalex.org/W4408352385","https://openalex.org/W4408352941","https://openalex.org/W4408353947"],"related_works":[],"abstract_inverted_index":{"Language-queried":[0],"Audio":[1],"Separation":[2],"(LASS)":[3],"employs":[4],"linguistic":[5,26],"queries":[6],"to":[7,111],"isolate":[8],"target":[9],"sounds":[10],"based":[11],"on":[12,37],"semantic":[13,86],"descriptions.":[14],"However,":[15],"existing":[16],"methods":[17],"face":[18],"challenges":[19],"in":[20,142],"aligning":[21],"complex":[22],"auditory":[23],"features":[24],"with":[25,84],"context":[27],"while":[28,107],"preserving":[29],"separation":[30,113],"precision.":[31],"Current":[32],"research":[33],"efforts":[34],"focus":[35],"primarily":[36],"text":[38],"description":[39],"augmentation":[40],"and":[41,55],"architectural":[42],"innovations,":[43],"yet":[44],"the":[45],"potential":[46],"of":[47,62],"integrating":[48,108],"pre-trained":[49],"self-supervised":[50],"learning":[51],"(SSL)":[52],"audio":[53],"models":[54],"Contrastive":[56],"Language-Audio":[57],"Pretraining":[58],"(CLAP)":[59],"frameworks,":[60],"capable":[61],"extracting":[63],"cross-modal":[64],"audio-text":[65],"relationships,":[66],"remains":[67],"underexplored.":[68],"To":[69],"address":[70],"this,":[71],"we":[72],"present":[73],"HybridSep,":[74],"a":[75,95],"two-stage":[76],"LASS":[77,136],"framework":[78,89],"that":[79,99,117],"synergizes":[80],"SSL-based":[81],"acoustic":[82],"representations":[83],"CLAP-derived":[85],"embeddings.":[87],"Our":[88],"introduces":[90],"Adversarial":[91],"Consistent":[92],"Training":[93],"(ACT),":[94],"novel":[96],"optimization":[97],"strategy":[98],"treats":[100],"diffusion":[101],"as":[102],"an":[103],"auxiliary":[104],"regularization":[105],"loss":[106],"adversarial":[109],"training":[110],"enhance":[112],"fidelity.":[114],"Experiments":[115],"demonstrate":[116],"HybridSep":[118],"achieves":[119],"significant":[120],"performance":[121],"improvements":[122],"over":[123],"state-of-the-art":[124],"baselines":[125],"(e.g.,":[126],"AudioSep,":[127],"FlowSep)":[128],"across":[129],"multiple":[130],"metrics,":[131],"establishing":[132],"new":[133],"benchmarks":[134],"for":[135],"tasks.":[137],"Demo":[138],"can":[139],"be":[140],"find":[141],"web.<sup":[143],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[144],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[145]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-11-14T00:00:00"}
