{"id":"https://openalex.org/W4408355294","doi":"https://doi.org/10.1109/icassp49660.2025.10888410","title":"SoundBeam meets M2D: Target Sound Extraction with Audio Foundation Model","display_name":"SoundBeam meets M2D: Target Sound Extraction with Audio Foundation Model","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408355294","doi":"https://doi.org/10.1109/icassp49660.2025.10888410"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888410","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888410","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048090174","display_name":"Carlos Hernandez-Olivan","orcid":"https://orcid.org/0000-0002-0235-2267"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Carlos Hernandez-Olivan","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023868166","display_name":"Marc Delcroix","orcid":"https://orcid.org/0000-0002-5175-7834"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Marc Delcroix","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070624983","display_name":"Tsubasa Ochiai","orcid":"https://orcid.org/0000-0002-2519-2032"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tsubasa Ochiai","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091219538","display_name":"Daisuke Niizumi","orcid":"https://orcid.org/0000-0002-5063-0508"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Daisuke Niizumi","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073111123","display_name":"Naohiro Tawara","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Naohiro Tawara","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021240106","display_name":"Tomohiro Nakatani","orcid":"https://orcid.org/0000-0002-7487-7150"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tomohiro Nakatani","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009309584","display_name":"Shoko Araki","orcid":"https://orcid.org/0000-0003-4363-4305"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shoko Araki","raw_affiliation_strings":["NTT Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5048090174"],"corresponding_institution_ids":["https://openalex.org/I2251713219"],"apc_list":null,"apc_paid":null,"fwci":7.6696,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.97215397,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9855999946594238,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9855999946594238,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9787999987602234,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9760000109672546,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6798962354660034},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.5414285063743591},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.35779625177383423},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.3183223009109497},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.05853831768035889}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6798962354660034},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.5414285063743591},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.35779625177383423},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.3183223009109497},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.05853831768035889}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888410","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888410","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320338124","display_name":"Strategic International Collaborative Research Program","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W2052666245","https://openalex.org/W2108668360","https://openalex.org/W2593116425","https://openalex.org/W2771361008","https://openalex.org/W2952218014","https://openalex.org/W2962865004","https://openalex.org/W3094550259","https://openalex.org/W3095263845","https://openalex.org/W3160972914","https://openalex.org/W3196974791","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4205689591","https://openalex.org/W4224929689","https://openalex.org/W4224933800","https://openalex.org/W4281492411","https://openalex.org/W4310873011","https://openalex.org/W4310873168","https://openalex.org/W4313156423","https://openalex.org/W4319862271","https://openalex.org/W4372260310","https://openalex.org/W4372348953","https://openalex.org/W4387835382","https://openalex.org/W4392903066","https://openalex.org/W4392904475","https://openalex.org/W4394862844","https://openalex.org/W4404317174","https://openalex.org/W4404609496","https://openalex.org/W6752516136","https://openalex.org/W6766320909","https://openalex.org/W6784333009","https://openalex.org/W6847363464","https://openalex.org/W7024468878"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2909726438"],"abstract_inverted_index":{"Target":[0],"sound":[1,9,141,176],"extraction":[2,146,195],"(TSE)":[3],"consists":[4],"of":[5,13,53,57,64,98,129,148],"isolating":[6],"a":[7,11,71,87,101,125,152,167],"desired":[8],"from":[10,39,75,161],"mixture":[12],"arbitrary":[14],"sounds":[15,65,99],"using":[16,86,124,191],"clues":[17],"to":[18,69,80,140],"identify":[19],"it.":[20],"A":[21],"TSE":[22,73,102,118,154,169],"system":[23,47,74,155,170],"requires":[24],"solving":[25],"two":[26],"problems":[27,147],"at":[28],"once,":[29],"identifying":[30],"the":[31,36,40,45,58,61,106,117,144,158],"target":[32,37,175],"source":[33],"and":[34,60,132,143,179],"extracting":[35],"signal":[38,145],"mixture.":[41],"For":[42],"increased":[43],"practicability,":[44],"same":[46],"should":[48],"work":[49],"with":[50],"various":[51],"types":[52],"sound.":[54],"The":[55],"duality":[56],"problem":[59],"wide":[62],"variety":[63],"make":[66],"it":[67,121],"challenging":[68],"train":[70],"powerful":[72],"scratch.":[76],"In":[77],"this":[78,82],"paper,":[79],"tackle":[81],"problem,":[83],"we":[84],"explore":[85],"pre-trained":[88],"audio":[89,183],"foundation":[90,110],"model":[91],"that":[92,156,171,190],"can":[93,172,193],"provide":[94],"rich":[95],"feature":[96,159],"representations":[97],"within":[100],"system.":[103],"We":[104,150,187],"chose":[105],"masked-modeling":[107],"duo":[108],"(M2D)":[109],"model,":[111],"which":[112,165],"appears":[113],"especially":[114,197],"suited":[115],"for":[116],"task,":[119],"as":[120,185],"is":[122,166],"trained":[123],"dual":[126],"objective":[127],"consisting":[128],"sound-label":[130],"predictions":[131],"improved":[133],"masked":[134],"prediction.":[135],"These":[136],"objectives":[137],"are":[138],"related":[139],"identification":[142],"TSE.":[149],"propose":[151],"new":[153],"integrates":[157],"representation":[160],"M2D":[162,192],"into":[163],"SoundBeam,":[164],"strong":[168],"exploit":[173],"both":[174],"class":[177],"labels":[178],"pre-recorded":[180],"enrollments":[181],"(or":[182],"queries)":[184],"clues.":[186,201],"show":[188],"experimentally":[189],"increase":[194],"performance,":[196],"when":[198],"employing":[199],"enrollment":[200]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
