{"id":"https://openalex.org/W4416250966","doi":"https://doi.org/10.1109/waspaa66052.2025.11230990","title":"RUMAA: Repeat-Aware Unified Music Audio Analysis for Score-Performance Alignment, Transcription, and Mistake Detection","display_name":"RUMAA: Repeat-Aware Unified Music Audio Analysis for Score-Performance Alignment, Transcription, and Mistake Detection","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416250966","doi":"https://doi.org/10.1109/waspaa66052.2025.11230990"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230990","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230990","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027945402","display_name":"Sungkyun Chang","orcid":"https://orcid.org/0000-0001-8616-6631"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Sungkyun Chang","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music,UK"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music,UK","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053652066","display_name":"Simon Dixon","orcid":"https://orcid.org/0000-0002-8886-7666"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Simon Dixon","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music,UK"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music,UK","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"last","author":{"id":null,"display_name":"Emmanouil Benetos","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Emmanouil Benetos","raw_affiliation_strings":["Queen Mary University of London,Centre for Digital Music,UK"],"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,Centre for Digital Music,UK","institution_ids":["https://openalex.org/I166337079"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5027945402"],"corresponding_institution_ids":["https://openalex.org/I166337079"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45341941,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8687999844551086,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8687999844551086,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.09629999846220016,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10788","display_name":"Neuroscience and Music Perception","score":0.013299999758601189,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mistake","display_name":"Mistake","score":0.9097999930381775},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.512499988079071},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.4602999985218048},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4553000032901764},{"id":"https://openalex.org/keywords/piano","display_name":"Piano","score":0.41429999470710754},{"id":"https://openalex.org/keywords/interdependence","display_name":"Interdependence","score":0.40220001339912415},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.39559999108314514}],"concepts":[{"id":"https://openalex.org/C2777179996","wikidata":"https://www.wikidata.org/wiki/Q911222","display_name":"Mistake","level":2,"score":0.9097999930381775},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7283999919891357},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6320000290870667},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.512499988079071},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.4602999985218048},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4553000032901764},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41830000281333923},{"id":"https://openalex.org/C124086623","wikidata":"https://www.wikidata.org/wiki/Q5994","display_name":"Piano","level":2,"score":0.41429999470710754},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4034999907016754},{"id":"https://openalex.org/C185874996","wikidata":"https://www.wikidata.org/wiki/Q269699","display_name":"Interdependence","level":2,"score":0.40220001339912415},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.39559999108314514},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.36039999127388},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.32089999318122864},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.30399999022483826},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2935999929904938},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C2780148112","wikidata":"https://www.wikidata.org/wiki/Q1432581","display_name":"Proxy (statistics)","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230990","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230990","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2006657628","https://openalex.org/W2091921805","https://openalex.org/W2478051194","https://openalex.org/W2728379535","https://openalex.org/W2906259846","https://openalex.org/W2964110616","https://openalex.org/W3162873781","https://openalex.org/W3175663427","https://openalex.org/W3183561830","https://openalex.org/W4255574772","https://openalex.org/W4382021246","https://openalex.org/W4389519385","https://openalex.org/W4404037281","https://openalex.org/W4409348283","https://openalex.org/W4411112969","https://openalex.org/W4416248321"],"related_works":[],"abstract_inverted_index":{"This":[0],"study":[1],"introduces":[2],"RUMAA,":[3],"a":[4,21,42,95],"transformer-based":[5],"framework":[6],"for":[7],"music":[8,98],"performance":[9,62],"analysis":[10],"that":[11,68],"unifies":[12],"score-to-performance":[13],"alignment,":[14],"score-informed":[15],"transcription,":[16],"and":[17,38,41,87,105],"mistake":[18,106],"detection":[19,107],"in":[20,94],"near":[22],"end-to-end":[23],"manner.":[24],"Unlike":[25],"prior":[26],"methods":[27,67,83],"addressing":[28],"these":[29],"tasks":[30],"separately,":[31],"RUMAA":[32,79],"integrates":[33],"them":[34,89],"using":[35],"pre-trained":[36],"score":[37],"audio":[39],"encoders":[40],"novel":[43],"tri-stream":[44],"decoder":[45],"capturing":[46],"task":[47],"interdependencies":[48],"through":[49],"proxy":[50],"tasks.":[51],"It":[52],"aligns":[53],"human-readable":[54],"MusicXML":[55],"scores":[56,86,91],"with":[57,75,92],"repeat":[58,77],"symbols":[59],"to":[60],"full-length":[61],"audio,":[63],"overcoming":[64],"traditional":[65],"MIDI-based":[66],"rely":[69],"on":[70,84,90],"manually":[71],"unfolded":[72],"score-MIDI":[73],"data":[74],"pre-specified":[76],"structures.":[78],"matches":[80],"state-of-the-art":[81],"alignment":[82],"non-repeated":[85],"outperforms":[88],"repeats":[93],"public":[96],"piano":[97],"dataset,":[99],"while":[100],"also":[101],"delivering":[102],"promising":[103],"transcription":[104],"results.":[108]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
