{"id":"https://openalex.org/W4404037281","doi":"https://doi.org/10.1109/mlsp58920.2024.10734819","title":"YourMT3+: Multi-Instrument Music Transcription with Enhanced Transformer Architectures and Cross-Dataset STEM Augmentation","display_name":"YourMT3+: Multi-Instrument Music Transcription with Enhanced Transformer Architectures and Cross-Dataset STEM Augmentation","publication_year":2024,"publication_date":"2024-09-22","ids":{"openalex":"https://openalex.org/W4404037281","doi":"https://doi.org/10.1109/mlsp58920.2024.10734819"},"language":"en","primary_location":{"id":"doi:10.1109/mlsp58920.2024.10734819","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp58920.2024.10734819","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 34th International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027945402","display_name":"Sungkyun Chang","orcid":"https://orcid.org/0000-0001-8616-6631"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Sungkyun Chang","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084672392","display_name":"Emmanouil Benetos","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Emmanouil Benetos","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074883610","display_name":"Holger Kirchhoff","orcid":"https://orcid.org/0009-0008-4655-729X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Holger Kirchhoff","raw_affiliation_strings":["Huawei"],"affiliations":[{"raw_affiliation_string":"Huawei","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053652066","display_name":"Simon Dixon","orcid":"https://orcid.org/0000-0002-8886-7666"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Simon Dixon","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music","institution_ids":["https://openalex.org/I166337079"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5027945402"],"corresponding_institution_ids":["https://openalex.org/I166337079"],"apc_list":null,"apc_paid":null,"fwci":1.8752,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.87043581,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13996","display_name":"Diverse Musicological Studies","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1210","display_name":"Music"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9753999710083008,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.625813364982605},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5714429020881653},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.46348267793655396},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.22130894660949707},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.20775997638702393}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.625813364982605},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5714429020881653},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.46348267793655396},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.22130894660949707},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.20775997638702393},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mlsp58920.2024.10734819","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp58920.2024.10734819","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 34th International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W2110007838","https://openalex.org/W2759171953","https://openalex.org/W2801401223","https://openalex.org/W2906214917","https://openalex.org/W2963535133","https://openalex.org/W3037149862","https://openalex.org/W3095303521","https://openalex.org/W3160649916","https://openalex.org/W3206146520","https://openalex.org/W4238553276","https://openalex.org/W4372260308","https://openalex.org/W4388979610","https://openalex.org/W4389519385","https://openalex.org/W4391640266","https://openalex.org/W6631117800","https://openalex.org/W6712576946","https://openalex.org/W6714030504","https://openalex.org/W6730401039","https://openalex.org/W6751104502","https://openalex.org/W6755182157","https://openalex.org/W6756341328","https://openalex.org/W6757817989","https://openalex.org/W6769627184","https://openalex.org/W6790830454","https://openalex.org/W6803088475","https://openalex.org/W6810809420","https://openalex.org/W6848751409","https://openalex.org/W6851857987","https://openalex.org/W6854061045","https://openalex.org/W6860710830"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Multi-instrument":[0],"music":[1,7,60,151],"transcription":[2,61,122,145],"aims":[3],"to":[4,13,44],"convert":[5],"polyphonic":[6],"recordings":[8,152],"into":[9],"musical":[10],"scores":[11],"assigned":[12],"each":[14],"instrument.":[15],"This":[16,48],"task":[17],"is":[18],"challenging":[19],"for":[20,57,103,114,127],"modeling":[21],"as":[22],"it":[23],"requires":[24],"simultaneously":[25],"identifying":[26],"multiple":[27],"instruments":[28],"and":[29,33,36,86,108,111,162],"transcribing":[30],"their":[31],"pitch":[32],"precise":[34],"timing,":[35],"the":[37,45,64,83,125,154],"lack":[38],"of":[39,55,70,90,156],"fully":[40],"annotated":[41],"data":[42,94],"adds":[43],"training":[46,104],"difficulties.":[47],"paper":[49],"introduces":[50],"Y":[51],"ourMT3+,":[52],"a":[53,78,88,98],"suite":[54],"models":[56],"enhanced":[58],"multi-instrument":[59],"based":[62],"on":[63,149],"recent":[65],"language":[66],"token":[67],"decoding":[68,101],"approach":[69],"MT3.":[71],"We":[72],"enhance":[73],"its":[74],"encoder":[75],"by":[76],"adopting":[77],"hierarchical":[79],"attention":[80],"transformer":[81],"in":[82],"time-frequency":[84],"domain":[85],"integrating":[87],"mixture":[89],"experts.":[91],"To":[92],"address":[93],"limitations,":[95],"we":[96],"introduce":[97],"new":[99],"multi-channel":[100],"method":[102],"with":[105,166],"incomplete":[106],"annotations":[107],"propose":[109],"intra-":[110],"cross-stem":[112],"augmentation":[113],"dataset":[115],"mixing.":[116],"Our":[117],"experiments":[118],"demonstrate":[119],"direct":[120],"vocal":[121],"capabilities,":[123],"eliminating":[124],"need":[126],"voice":[128],"separation":[129],"pre-processors.":[130],"Benchmarks":[131],"across":[132],"ten":[133],"public":[134],"datasets":[135,163],"show":[136],"our":[137],"models'":[138],"competitiveness":[139],"with,":[140],"or":[141],"superiority":[142],"to,":[143],"existing":[144],"models.":[146,158],"Further":[147],"testing":[148],"pop":[150],"highlights":[153],"limitations":[155],"current":[157],"Fully":[159],"reproducible":[160],"code":[161],"are":[164],"available":[165],"demos":[167],"at":[168],"https://github.com/mimbres/YourMT3.":[169]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
