{"id":"https://openalex.org/W4392903849","doi":"https://doi.org/10.1109/icassp48485.2024.10446141","title":"Timbre-Trap: A Low-Resource Framework for Instrument-Agnostic Music Transcription","display_name":"Timbre-Trap: A Low-Resource Framework for Instrument-Agnostic Music Transcription","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903849","doi":"https://doi.org/10.1109/icassp48485.2024.10446141"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446141","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446141","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074979798","display_name":"Frank Cwitkowitz","orcid":null},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Frank Cwitkowitz","raw_affiliation_strings":["University of Rochester"],"affiliations":[{"raw_affiliation_string":"University of Rochester","institution_ids":["https://openalex.org/I5388228"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069612434","display_name":"Kin Wai Cheuk","orcid":"https://orcid.org/0000-0003-3213-8242"},"institutions":[{"id":"https://openalex.org/I2800278093","display_name":"Sony Corporation (United States)","ror":"https://ror.org/05k91zb11","country_code":"US","type":"company","lineage":["https://openalex.org/I2800278093"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kin Wai Cheuk","raw_affiliation_strings":["Sony AI"],"affiliations":[{"raw_affiliation_string":"Sony AI","institution_ids":["https://openalex.org/I2800278093"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102881663","display_name":"Woosung Choi","orcid":"https://orcid.org/0000-0003-2638-2097"},"institutions":[{"id":"https://openalex.org/I2800278093","display_name":"Sony Corporation (United States)","ror":"https://ror.org/05k91zb11","country_code":"US","type":"company","lineage":["https://openalex.org/I2800278093"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Woosung Choi","raw_affiliation_strings":["Sony AI"],"affiliations":[{"raw_affiliation_string":"Sony AI","institution_ids":["https://openalex.org/I2800278093"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017282176","display_name":"Marco A. Mart\u00ednez-Ram\u00edrez","orcid":null},"institutions":[{"id":"https://openalex.org/I2800278093","display_name":"Sony Corporation (United States)","ror":"https://ror.org/05k91zb11","country_code":"US","type":"company","lineage":["https://openalex.org/I2800278093"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Marco A. Mart\u00ednez-Ram\u00edrez","raw_affiliation_strings":["Sony AI"],"affiliations":[{"raw_affiliation_string":"Sony AI","institution_ids":["https://openalex.org/I2800278093"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050224316","display_name":"Keisuke Toyama","orcid":"https://orcid.org/0000-0003-3632-2530"},"institutions":[{"id":"https://openalex.org/I2800278093","display_name":"Sony Corporation (United States)","ror":"https://ror.org/05k91zb11","country_code":"US","type":"company","lineage":["https://openalex.org/I2800278093"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Keisuke Toyama","raw_affiliation_strings":["Sony Group Corporation"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation","institution_ids":["https://openalex.org/I2800278093"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103537853","display_name":"Wei\u2010Hsiang Liao","orcid":"https://orcid.org/0000-0003-4113-1894"},"institutions":[{"id":"https://openalex.org/I2800278093","display_name":"Sony Corporation (United States)","ror":"https://ror.org/05k91zb11","country_code":"US","type":"company","lineage":["https://openalex.org/I2800278093"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wei-Hsiang Liao","raw_affiliation_strings":["Sony AI"],"affiliations":[{"raw_affiliation_string":"Sony AI","institution_ids":["https://openalex.org/I2800278093"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088754502","display_name":"Yuki Mitsufuji","orcid":"https://orcid.org/0000-0002-6806-6140"},"institutions":[{"id":"https://openalex.org/I2800278093","display_name":"Sony Corporation (United States)","ror":"https://ror.org/05k91zb11","country_code":"US","type":"company","lineage":["https://openalex.org/I2800278093"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":["Sony AI","Sony Group Corporation"],"affiliations":[{"raw_affiliation_string":"Sony AI","institution_ids":["https://openalex.org/I2800278093"]},{"raw_affiliation_string":"Sony Group Corporation","institution_ids":["https://openalex.org/I2800278093"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5074979798"],"corresponding_institution_ids":["https://openalex.org/I5388228"],"apc_list":null,"apc_paid":null,"fwci":2.2502,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.87841311,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1291","last_page":"1295"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.9032310247421265},{"id":"https://openalex.org/keywords/salience","display_name":"Salience (neuroscience)","score":0.7401098012924194},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7374491095542908},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.6263286471366882},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5376132130622864},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5261912941932678},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33391883969306946},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.12112978100776672}],"concepts":[{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.9032310247421265},{"id":"https://openalex.org/C108154423","wikidata":"https://www.wikidata.org/wiki/Q1469792","display_name":"Salience (neuroscience)","level":2,"score":0.7401098012924194},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7374491095542908},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.6263286471366882},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5376132130622864},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5261912941932678},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33391883969306946},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.12112978100776672},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446141","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446141","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W129413713","https://openalex.org/W2076608692","https://openalex.org/W2107328928","https://openalex.org/W2150279609","https://openalex.org/W2187089797","https://openalex.org/W2198584637","https://openalex.org/W2523056285","https://openalex.org/W2740337320","https://openalex.org/W2759171953","https://openalex.org/W2760103357","https://openalex.org/W2778340325","https://openalex.org/W2906214917","https://openalex.org/W3015247127","https://openalex.org/W3095303521","https://openalex.org/W3101943858","https://openalex.org/W3160787318","https://openalex.org/W3206146520","https://openalex.org/W3215615641","https://openalex.org/W4225281045","https://openalex.org/W4292826157","https://openalex.org/W4318751639","https://openalex.org/W4372260308","https://openalex.org/W4372341905","https://openalex.org/W4386158734","https://openalex.org/W6697040288","https://openalex.org/W6697316635","https://openalex.org/W6714030504","https://openalex.org/W6732646663","https://openalex.org/W6746836464","https://openalex.org/W6747336215","https://openalex.org/W6755182157","https://openalex.org/W6756341328","https://openalex.org/W6771763809","https://openalex.org/W6784488536","https://openalex.org/W6803088475","https://openalex.org/W6849446024"],"related_works":["https://openalex.org/W2406877384","https://openalex.org/W2595839522","https://openalex.org/W2124576126","https://openalex.org/W2138455584","https://openalex.org/W1984264321","https://openalex.org/W3109975354","https://openalex.org/W2357614930","https://openalex.org/W2008642382","https://openalex.org/W2028097510","https://openalex.org/W2505877856"],"abstract_inverted_index":{"In":[0,113],"recent":[1],"years,":[2],"research":[3],"on":[4,10,51],"music":[5,71],"transcription":[6,41,72,145],"has":[7],"focused":[8],"mainly":[9],"architecture":[11],"design":[12],"and":[13,73,83,95],"instrument-specific":[14],"data":[15,60],"acquisition.":[16],"With":[17],"the":[18,47,58,78,105,116,136],"lack":[19],"of":[20,22,49,153],"availability":[21,61],"diverse":[23],"datasets,":[24],"progress":[25],"is":[26],"often":[27],"limited":[28],"to":[29,45,90,119,123,139,142],"solo-instrument":[30],"tasks":[31],"such":[32],"as":[33,42,130],"piano":[34],"transcription.":[35],"Several":[36],"works":[37],"have":[38],"explored":[39],"multi-instrument":[40],"a":[43,66,87,109,150],"means":[44],"bolster":[46],"performance":[48,140],"models":[50],"low-resource":[52],"tasks,":[53],"but":[54],"these":[55],"methods":[56],"face":[57],"same":[59],"issues.":[62],"We":[63,85,133],"propose":[64],"Timbre-Trap,":[65],"novel":[67],"framework":[68,137],"which":[69,126],"unifies":[70],"audio":[74],"reconstruction":[75],"by":[76],"exploiting":[77],"strong":[79],"separability":[80],"between":[81,101],"pitch":[82,93,131],"timbre.":[84],"train":[86],"single":[88],"autoencoder":[89],"simultaneously":[91],"estimate":[92],"salience":[94],"reconstruct":[96],"complex":[97],"spectral":[98],"coefficients,":[99],"selecting":[100],"either":[102],"output":[103],"during":[104],"decoding":[106],"stage":[107],"via":[108],"simple":[110],"switch":[111],"mechanism.":[112],"this":[114],"way,":[115],"model":[117],"learns":[118],"produce":[120],"coefficients":[121],"corresponding":[122],"timbre-less":[124],"audio,":[125],"can":[127],"be":[128],"interpreted":[129],"salience.":[132],"demonstrate":[134],"that":[135],"leads":[138],"comparable":[141],"state-of-the-art":[143],"instrument-agnostic":[144],"methods,":[146],"while":[147],"only":[148],"requiring":[149],"small":[151],"amount":[152],"annotated":[154],"data.":[155]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
