{"id":"https://openalex.org/W7108692974","doi":"https://doi.org/10.5281/zenodo.17811567","title":"Fx-Encoder++: Extracting Instrument-Wise Audio Effects Representations from Mixtures","display_name":"Fx-Encoder++: Extracting Instrument-Wise Audio Effects Representations from Mixtures","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108692974","doi":"https://doi.org/10.5281/zenodo.17811567"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811567","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811567","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811567","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yen-Tung Yeh","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yen-Tung Yeh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Junghyun Koo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junghyun Koo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Marco Mart\u00ednez-Ram\u00edrez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marco Mart\u00ednez-Ram\u00edrez","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wei-Hsiang Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei-Hsiang Liao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yi-Hsuan  Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi-Hsuan  Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Yuki Mitsufuji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.63054339,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10788","display_name":"Neuroscience and Music Perception","score":0.0007999999797903001,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/music-information-retrieval","display_name":"Music information retrieval","score":0.603600025177002},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5587000250816345},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5353000164031982},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.44929999113082886},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.44620001316070557},{"id":"https://openalex.org/keywords/audio-analyzer","display_name":"Audio analyzer","score":0.390500009059906},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.3847000002861023},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.37119999527931213}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7483000159263611},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.603600025177002},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5587000250816345},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5353000164031982},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5012000203132629},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.44929999113082886},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.44620001316070557},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44179999828338623},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.400299996137619},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.390500009059906},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.3847000002861023},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.37119999527931213},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.3474000096321106},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.30809998512268066},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.3075000047683716},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2957000136375427},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2784999907016754},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811567","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811567","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811567","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811567","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[{"score":0.4106478691101074,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"General-purpose":[0],"audio":[1,25,33,48,68,97,102,112],"representations":[2,70],"have":[3,31],"proven":[4],"effective":[5],"across":[6,109,120],"diverse":[7,122],"music":[8,17,72,154],"information":[9],"retrieval":[10,110],"applications,":[11],"yet":[12],"their":[13],"utility":[14],"in":[15,152],"intelligent":[16,153],"production":[18,155],"remains":[19],"limited":[20],"by":[21],"insufficient":[22],"understanding":[23],"of":[24,124],"effects":[26,34,49,69,144],"(Fx).":[27],"Although":[28],"previous":[29,132],"approaches":[30,133],"emphasized":[32],"analysis":[35],"at":[36,134],"the":[37],"mixture":[38,135],"level,":[39],"this":[40,56],"focus":[41],"falls":[42],"short":[43],"for":[44],"tasks":[45],"demanding":[46],"instrument-wise":[47,67,101],"understanding,":[50],"such":[51],"as":[52],"automatic":[53],"mixing.":[54],"In":[55],"work,":[57],"we":[58],"present":[59],"Fx-Encoder++,":[60],"a":[61,77,121,139,148],"novel":[62,140],"model":[63,108],"designed":[64],"to":[65,142],"extract":[66,143],"from":[71],"mixtures.":[73],"Our":[74],"approach":[75],"leverages":[76],"contrastive":[78],"learning":[79],"framework":[80],"and":[81,111,137],"introduces":[82],"an":[83],"``extractor''":[84],"mechanism":[85],"that,":[86],"when":[87],"provided":[88],"with":[89],"instrument":[90],"queries":[91],"(audio":[92],"or":[93],"text),":[94],"transforms":[95],"mixture-level":[96],"effect":[98,103,113],"embeddings":[99],"into":[100],"embeddings.":[104],"We":[105],"evaluated":[106],"our":[107],"parameter":[114],"matching":[115],"tasks,":[116],"testing":[117],"its":[118],"performance":[119],"range":[123],"instruments.":[125],"The":[126],"results":[127],"demonstrate":[128],"that":[129],"Fx-Encoder++":[130],"outperforms":[131],"level":[136],"show":[138],"ability":[141],"representation":[145],"instrument-wise,":[146],"addressing":[147],"critical":[149],"capability":[150],"gap":[151],"systems.":[156]},"counts_by_year":[],"updated_date":"2025-12-05T23:25:22.460635","created_date":"2025-12-05T00:00:00"}
