{"id":"https://openalex.org/W4391021729","doi":"https://doi.org/10.1109/asru57964.2023.10389711","title":"Zero-Shot Singing Voice Synthesis from Musical Score","display_name":"Zero-Shot Singing Voice Synthesis from Musical Score","publication_year":2023,"publication_date":"2023-12-16","ids":{"openalex":"https://openalex.org/W4391021729","doi":"https://doi.org/10.1109/asru57964.2023.10389711"},"language":"en","primary_location":{"id":"doi:10.1109/asru57964.2023.10389711","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru57964.2023.10389711","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016862479","display_name":"Jun-You Wang","orcid":"https://orcid.org/0000-0002-9119-9259"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Jun-You Wang","raw_affiliation_strings":["National Taiwan University,Taiwan","National Taiwan University, Taiwan"],"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]},{"raw_affiliation_string":"National Taiwan University, Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040508737","display_name":"Hung-yi Lee","orcid":"https://orcid.org/0000-0002-9654-5747"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hung-Yi Lee","raw_affiliation_strings":["National Taiwan University,Taiwan","National Taiwan University, Taiwan"],"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]},{"raw_affiliation_string":"National Taiwan University, Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073451247","display_name":"Jyh\u2010Shing Roger Jang","orcid":"https://orcid.org/0000-0002-7319-9095"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Jyh-Shing Roger Jang","raw_affiliation_strings":["National Taiwan University,Taiwan","National Taiwan University, Taiwan"],"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taiwan","institution_ids":["https://openalex.org/I16733864"]},{"raw_affiliation_string":"National Taiwan University, Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038264558","display_name":"Li Su","orcid":"https://orcid.org/0000-0003-4275-8832"},"institutions":[{"id":"https://openalex.org/I4210098366","display_name":"Institute of Information Science, Academia Sinica","ror":"https://ror.org/00z83z196","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210098366","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Li Su","raw_affiliation_strings":["Institute of Information Science,Academia Sinica,Taiwan","Academia Sinica, Institute of Information Science, Taiwan"],"affiliations":[{"raw_affiliation_string":"Institute of Information Science,Academia Sinica,Taiwan","institution_ids":["https://openalex.org/I4210098366"]},{"raw_affiliation_string":"Academia Sinica, Institute of Information Science, Taiwan","institution_ids":["https://openalex.org/I4210098366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5016862479"],"corresponding_institution_ids":["https://openalex.org/I16733864"],"apc_list":null,"apc_paid":null,"fwci":0.5237,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.73179845,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.8433703780174255},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7225273847579956},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7137582898139954},{"id":"https://openalex.org/keywords/lyrics","display_name":"Lyrics","score":0.668329119682312},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6482862830162048},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.647621750831604},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.527451753616333},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.48632651567459106},{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.4692053198814392},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.46615469455718994},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37694671750068665},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.1468563973903656},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1059822142124176},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.09894147515296936}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.8433703780174255},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7225273847579956},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7137582898139954},{"id":"https://openalex.org/C2776436406","wikidata":"https://www.wikidata.org/wiki/Q602446","display_name":"Lyrics","level":2,"score":0.668329119682312},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6482862830162048},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.647621750831604},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.527451753616333},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.48632651567459106},{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.4692053198814392},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.46615469455718994},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37694671750068665},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.1468563973903656},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1059822142124176},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.09894147515296936},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru57964.2023.10389711","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru57964.2023.10389711","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W2102870814","https://openalex.org/W2127141656","https://openalex.org/W2293640355","https://openalex.org/W2560674852","https://openalex.org/W2747329762","https://openalex.org/W2747874407","https://openalex.org/W2778460379","https://openalex.org/W2921576841","https://openalex.org/W2962866891","https://openalex.org/W2964243274","https://openalex.org/W2984106626","https://openalex.org/W3013020904","https://openalex.org/W3015338123","https://openalex.org/W3016250102","https://openalex.org/W3081279708","https://openalex.org/W3097514409","https://openalex.org/W3133525064","https://openalex.org/W3161627112","https://openalex.org/W3163345527","https://openalex.org/W3207038517","https://openalex.org/W3207340675","https://openalex.org/W4221155333","https://openalex.org/W4235670393","https://openalex.org/W4285345683","https://openalex.org/W4298310324","https://openalex.org/W4366460484","https://openalex.org/W6763832098","https://openalex.org/W6766320909","https://openalex.org/W6774314701","https://openalex.org/W6778823374","https://openalex.org/W6790318877","https://openalex.org/W6790830454","https://openalex.org/W6795807602","https://openalex.org/W6809623261","https://openalex.org/W6847430118","https://openalex.org/W6851724922"],"related_works":["https://openalex.org/W2360952181","https://openalex.org/W4310670065","https://openalex.org/W634160686","https://openalex.org/W2597614303","https://openalex.org/W3214861561","https://openalex.org/W2389838651","https://openalex.org/W437317580","https://openalex.org/W2352995288","https://openalex.org/W2378183644","https://openalex.org/W2287414930"],"abstract_inverted_index":{"Zero-shot":[0],"singing":[1,10],"voice":[2,11],"synthesis":[3],"(SVS),":[4],"the":[5,9,22,45,49,70,93,100,108,112,124],"task":[6],"to":[7,107,115],"synthesize":[8],"of":[12,99],"an":[13,80,105],"arbitrary":[14],"target":[15],"singer,":[16],"has":[17],"gained":[18],"increasing":[19],"attentions":[20],"in":[21,131],"past":[23],"few":[24],"years.":[25],"Several":[26],"recently":[27],"proposed":[28,125],"systems":[29,39],"have":[30],"demonstrated":[31],"promising":[32],"results":[33],"on":[34],"this":[35,55],"task.":[36],"However,":[37],"these":[38],"require":[40],"detailed":[41],"musical":[42,50,67,71,109],"features":[43,86],"at":[44],"frame":[46],"level":[47],"as":[48,69,104],"content.":[51],"To":[52,74],"deal":[53],"with":[54,65,92],"issue,":[56],"we":[57,78],"propose":[58],"a":[59],"model":[60,76,114],"that":[61,83,123],"performs":[62],"zero-shot":[63],"SVS":[64,113],"only":[66],"score":[68],"content":[72],"condition.":[73],"help":[75],"training,":[77],"build":[79],"acoustic":[81,101],"encoder":[82,102],"extracts":[84],"linguistic":[85],"from":[87,117],"audio,":[88],"and":[89,134],"train":[90],"it":[91],"lyrics":[94],"transcription":[95],"objective.":[96],"The":[97],"output":[98],"serves":[103],"alternative":[106],"score,":[110],"allowing":[111],"learn":[116],"weakly":[118],"labeled":[119],"data.":[120],"Results":[121],"suggest":[122],"method":[126,130],"outperforms":[127],"baseline":[128],"semi-supervised":[129],"both":[132],"subjective":[133],"objective":[135],"tests.":[136]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
