{"id":"https://openalex.org/W4392705114","doi":"https://doi.org/10.1145/3651310","title":"Automatic Lyric Transcription and Automatic Music Transcription from Multimodal Singing","display_name":"Automatic Lyric Transcription and Automatic Music Transcription from Multimodal Singing","publication_year":2024,"publication_date":"2024-03-12","ids":{"openalex":"https://openalex.org/W4392705114","doi":"https://doi.org/10.1145/3651310"},"language":"en","primary_location":{"id":"doi:10.1145/3651310","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3651310","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3651310","source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3651310","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042584368","display_name":"Xiangming Gu","orcid":"https://orcid.org/0000-0003-0637-8664"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xiangming Gu","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0003-0637-8664","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003653417","display_name":"Longshen Ou","orcid":"https://orcid.org/0000-0002-1725-8361"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Longshen Ou","raw_affiliation_strings":["National University of Singapore, Singapore Singapore"],"raw_orcid":"https://orcid.org/0000-0002-1725-8361","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100660203","display_name":"Wei Zeng","orcid":"https://orcid.org/0000-0002-0953-5314"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Wei Zeng","raw_affiliation_strings":["National University of Singapore, Singapore Singapore"],"raw_orcid":"https://orcid.org/0000-0002-0953-5314","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100460583","display_name":"Jianan Zhang","orcid":"https://orcid.org/0000-0002-4969-1880"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Jianan Zhang","raw_affiliation_strings":["National University of Singapore, Singapore Singapore"],"raw_orcid":"https://orcid.org/0000-0002-4969-1880","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101966036","display_name":"N. H. Wong","orcid":"https://orcid.org/0009-0004-6473-6938"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Nicholas Wong","raw_affiliation_strings":["National University of Singapore, Singapore Singapore"],"raw_orcid":"https://orcid.org/0009-0004-6473-6938","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100423435","display_name":"Ye Wang","orcid":"https://orcid.org/0000-0002-0123-1260"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Ye Wang","raw_affiliation_strings":["National University of Singapore, Singapore Singapore"],"raw_orcid":"https://orcid.org/0000-0002-0123-1260","affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":2.6952,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":{"value":0.90598517,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"20","issue":"7","first_page":"1","last_page":"29"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.8218954801559448},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.7831796407699585},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5672047138214111},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.3871367275714874},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.3212640881538391},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.2151007354259491},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.17703881859779358},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.08410075306892395}],"concepts":[{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.8218954801559448},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.7831796407699585},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5672047138214111},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3871367275714874},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.3212640881538391},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2151007354259491},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.17703881859779358},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.08410075306892395},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3651310","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3651310","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3651310","source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3651310","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3651310","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3651310","source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.4399999976158142,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G7835538620","display_name":null,"funder_award_id":"MOE-T2EP20120-0012","funder_id":"https://openalex.org/F4320320751","funder_display_name":"Ministry of Education - Singapore"}],"funders":[{"id":"https://openalex.org/F4320320751","display_name":"Ministry of Education - Singapore","ror":"https://ror.org/01kcva023"},{"id":"https://openalex.org/F4320322724","display_name":"Ministry of Education, India","ror":"https://ror.org/048xjjh50"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4392705114.pdf","grobid_xml":"https://content.openalex.org/works/W4392705114.grobid-xml"},"referenced_works_count":49,"referenced_works":["https://openalex.org/W395933519","https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W1990449877","https://openalex.org/W1990908299","https://openalex.org/W1993064527","https://openalex.org/W1998849596","https://openalex.org/W2015394094","https://openalex.org/W2017251347","https://openalex.org/W2077132865","https://openalex.org/W2082511635","https://openalex.org/W2091425152","https://openalex.org/W2118774185","https://openalex.org/W2144827818","https://openalex.org/W2194775991","https://openalex.org/W2407685581","https://openalex.org/W2541711215","https://openalex.org/W2605311049","https://openalex.org/W2606429533","https://openalex.org/W2620971809","https://openalex.org/W2730845691","https://openalex.org/W2766219058","https://openalex.org/W2808631503","https://openalex.org/W2891205112","https://openalex.org/W2897003492","https://openalex.org/W2903006902","https://openalex.org/W2914911817","https://openalex.org/W2915722758","https://openalex.org/W2964159205","https://openalex.org/W2990594533","https://openalex.org/W2990824810","https://openalex.org/W3011176162","https://openalex.org/W3035014310","https://openalex.org/W3035160371","https://openalex.org/W3036601975","https://openalex.org/W3149794337","https://openalex.org/W3150354321","https://openalex.org/W3152594694","https://openalex.org/W3158762648","https://openalex.org/W3206191467","https://openalex.org/W3207537403","https://openalex.org/W4214540501","https://openalex.org/W4221153521","https://openalex.org/W4221161145","https://openalex.org/W4226106344","https://openalex.org/W4285819380","https://openalex.org/W4297841641","https://openalex.org/W4379251869","https://openalex.org/W4387967920"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2390529913","https://openalex.org/W2142368101","https://openalex.org/W2372249404","https://openalex.org/W2367547137","https://openalex.org/W2354994102","https://openalex.org/W2387733758","https://openalex.org/W2376664795","https://openalex.org/W2028097510","https://openalex.org/W2505877856"],"abstract_inverted_index":{"Automatic":[0],"lyric":[1],"transcription":[2,14,46],"(ALT)":[3],"refers":[4,16],"to":[5,17,60,151,165,206],"transcribing":[6,18],"singing":[7,19,54,108],"voices":[8,20],"into":[9,21],"lyrics,":[10],"while":[11],"automatic":[12],"music":[13],"(AMT)":[15],"note":[22,50,130],"events,":[23],"i.e.,":[24],"musical":[25,67],"MIDI":[26],"notes.":[27,85],"Despite":[28],"these":[29,213],"two":[30],"tasks":[31],"having":[32],"significant":[33],"potential":[34],"for":[35,95,127],"practical":[36],"application,":[37],"they":[38],"are":[39],"still":[40],"nascent.":[41],"This":[42],"is":[43,56],"because":[44],"the":[45,61,75,81,105,142,153,170,200,207,216],"of":[47,63,73,77,83,120,155,203,218,226],"lyrics":[48,79,128],"and":[49,80,99,112,118,129,148,172,190,214],"events":[51],"solely":[52],"from":[53,141,169],"audio":[55,116,171],"notoriously":[57],"difficult":[58],"due":[59],"presence":[62],"noise":[64,228],"contamination,":[65],"e.g.,":[66],"accompaniment,":[68],"resulting":[69],"in":[70,224],"a":[71,92,161],"degradation":[72],"both":[74,188],"intelligibility":[76],"sung":[78,84],"recognizability":[82],"To":[86],"address":[87],"this":[88],"challenge,":[89],"we":[90,103,135,178,197,211],"propose":[91,136],"general":[93],"framework":[94],"implementing":[96],"multimodal":[97,107,208,221],"ALT":[98,189],"AMT":[100,191],"systems.":[101],"Additionally,":[102],"curate":[104],"first":[106],"dataset,":[109],"comprising":[110],"N20EMv1":[111],"N20EMv2,":[113],"which":[114],"encompasses":[115],"recordings":[117],"videos":[119],"lip":[121],"movements,":[122],"together":[123],"with":[124],"ground":[125],"truth":[126],"events.":[131],"For":[132],"model":[133],"construction,":[134],"adapting":[137],"self-supervised":[138],"learning":[139],"models":[140],"speech":[143],"domain":[144],"as":[145],"acoustic":[146],"encoders":[147,150],"visual":[149],"alleviate":[152],"scarcity":[154],"labeled":[156],"data.":[157],"We":[158],"also":[159,198],"introduce":[160],"residual":[162],"cross-attention":[163],"mechanism":[164],"effectively":[166],"integrate":[167],"features":[168],"video":[173],"modalities.":[174],"Through":[175],"extensive":[176],"experiments,":[177,196],"demonstrate":[179,215],"that":[180],"our":[181,219],"single-modal":[182,195],"systems":[183],"exhibit":[184],"state-of-the-art":[185],"performance":[186],"on":[187],"tasks.":[192],"Subsequently,":[193],"through":[194],"explore":[199],"individual":[201],"contributions":[202],"each":[204],"modality":[205],"system.":[209],"Finally,":[210],"combine":[212],"effectiveness":[217],"proposed":[220],"systems,":[222],"particularly":[223],"terms":[225],"their":[227],"robustness.":[229]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-10-10T00:00:00"}
