{"id":"https://openalex.org/W4406461613","doi":"https://doi.org/10.1109/slt61566.2024.10832285","title":"Instructsing: High-Fidelity Singing Voice Generation Via Instructing Yourself","display_name":"Instructsing: High-Fidelity Singing Voice Generation Via Instructing Yourself","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461613","doi":"https://doi.org/10.1109/slt61566.2024.10832285"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832285","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100654128","display_name":"Chang Zeng","orcid":"https://orcid.org/0000-0002-4882-1823"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Chang Zeng","raw_affiliation_strings":["National Insitute of Informatics,Japan"],"affiliations":[{"raw_affiliation_string":"National Insitute of Informatics,Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100376538","display_name":"Chunhui Wang","orcid":"https://orcid.org/0000-0002-7151-483X"},"institutions":[{"id":"https://openalex.org/I4210153393","display_name":"Geely (China)","ror":"https://ror.org/0446d5v35","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210153393"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chunhui Wang","raw_affiliation_strings":["Geely,China"],"affiliations":[{"raw_affiliation_string":"Geely,China","institution_ids":["https://openalex.org/I4210153393"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102018159","display_name":"Xiaoxiao Miao","orcid":"https://orcid.org/0000-0002-6645-6524"},"institutions":[{"id":"https://openalex.org/I168639165","display_name":"Singapore Institute of Technology","ror":"https://ror.org/01v2c2791","country_code":"SG","type":"education","lineage":["https://openalex.org/I168639165"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xiaoxiao Miao","raw_affiliation_strings":["Singapore Institute of Technology,Singapore"],"affiliations":[{"raw_affiliation_string":"Singapore Institute of Technology,Singapore","institution_ids":["https://openalex.org/I168639165"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044520508","display_name":"Jian Zhao","orcid":"https://orcid.org/0000-0002-1306-1676"},"institutions":[{"id":"https://openalex.org/I4210153393","display_name":"Geely (China)","ror":"https://ror.org/0446d5v35","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210153393"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Zhao","raw_affiliation_strings":["Geely,China"],"affiliations":[{"raw_affiliation_string":"Geely,China","institution_ids":["https://openalex.org/I4210153393"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003114828","display_name":"Zhonglin Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210153393","display_name":"Geely (China)","ror":"https://ror.org/0446d5v35","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210153393"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhonglin Jiang","raw_affiliation_strings":["Geely,China"],"affiliations":[{"raw_affiliation_string":"Geely,China","institution_ids":["https://openalex.org/I4210153393"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5035121805","display_name":"Yong Chen","orcid":"https://orcid.org/0000-0003-1691-9460"},"institutions":[{"id":"https://openalex.org/I4210153393","display_name":"Geely (China)","ror":"https://ror.org/0446d5v35","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210153393"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yong Chen","raw_affiliation_strings":["Geely,China"],"affiliations":[{"raw_affiliation_string":"Geely,China","institution_ids":["https://openalex.org/I4210153393"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100654128"],"corresponding_institution_ids":["https://openalex.org/I184597095"],"apc_list":null,"apc_paid":null,"fwci":0.3735,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.62167372,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"675","last_page":"681"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.7999343872070312},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7206125259399414},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.6265533566474915},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.6038773059844971},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5592086911201477},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.34592434763908386},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.15920987725257874},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.15700426697731018},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.13066482543945312},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.07263550162315369}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.7999343872070312},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7206125259399414},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.6265533566474915},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.6038773059844971},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5592086911201477},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.34592434763908386},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.15920987725257874},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.15700426697731018},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.13066482543945312},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.07263550162315369},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt61566.2024.10832285","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832285","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320311687","display_name":"Ministry of Education","ror":"https://ror.org/03m01yf64"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W1583837637","https://openalex.org/W2088432713","https://openalex.org/W2593414223","https://openalex.org/W2603947633","https://openalex.org/W2946200149","https://openalex.org/W2963091184","https://openalex.org/W2963175743","https://openalex.org/W2970006822","https://openalex.org/W2985693847","https://openalex.org/W2990440871","https://openalex.org/W3015338123","https://openalex.org/W3082910224","https://openalex.org/W3092028330","https://openalex.org/W3097514409","https://openalex.org/W3097538987","https://openalex.org/W3144035034","https://openalex.org/W3158762648","https://openalex.org/W3197273793","https://openalex.org/W3197385343","https://openalex.org/W3206191467","https://openalex.org/W4280575909","https://openalex.org/W4283021642","https://openalex.org/W4283455388","https://openalex.org/W4285345683","https://openalex.org/W4296068763","https://openalex.org/W4296068811","https://openalex.org/W4297841449","https://openalex.org/W4307323391","https://openalex.org/W4372260108","https://openalex.org/W4372262501","https://openalex.org/W4375869170","https://openalex.org/W4375869299","https://openalex.org/W4385823074","https://openalex.org/W4391021559","https://openalex.org/W4400381851","https://openalex.org/W4405182531","https://openalex.org/W6639824700","https://openalex.org/W6640212811","https://openalex.org/W6757817989","https://openalex.org/W6763832098","https://openalex.org/W6767111847","https://openalex.org/W6771763809","https://openalex.org/W6783382068","https://openalex.org/W6783867762","https://openalex.org/W6849109464"],"related_works":["https://openalex.org/W4313443006","https://openalex.org/W2945374968","https://openalex.org/W4293777179","https://openalex.org/W4385452045","https://openalex.org/W2164070813","https://openalex.org/W2135608140","https://openalex.org/W2895525995","https://openalex.org/W2332512904","https://openalex.org/W4224231624","https://openalex.org/W2319626700"],"abstract_inverted_index":{"It":[0,52],"is":[1,80],"challenging":[2],"to":[3,67,98,111,124,157,180],"accelerate":[4],"the":[5,60,77,92,95,112,115,120,166,188],"training":[6,167],"process":[7],"while":[8,39],"ensuring":[9],"both":[10],"high-quality":[11],"generated":[12],"voices":[13],"and":[14,49,56,106,184],"acceptable":[15],"inference":[16],"speed.":[17],"In":[18,109,131],"this":[19],"paper,":[20],"we":[21,135],"propose":[22],"a":[23,63,99,137,146,170],"novel":[24],"neural":[25,37,159],"vocoder":[26],"called":[27],"InstructSing,":[28],"which":[29,90],"can":[30],"converge":[31],"much":[32],"faster":[33],"compared":[34],"with":[35,82,145,162],"other":[36,158],"vocoders":[38,160],"maintaining":[40],"good":[41],"performance":[42],"by":[43,86],"integrating":[44],"differentiable":[45],"digital":[46],"signal":[47],"processing":[48],"adversarial":[50],"training.":[51],"includes":[53],"one":[54],"generator":[55,61],"two":[57],"discriminators.":[58],"Specifically,":[59],"incorporates":[62],"harmonic-plus-noise":[64],"(HN)":[65],"module":[66,79,97],"produce":[68],"8":[69],"kHz":[70,127],"audio":[71],"as":[72,122,140],"an":[73,83,87],"instructive":[74],"signal.":[75],"Subsequently,":[76],"HN":[78,96],"connected":[81],"extended":[84,116],"WaveNet":[85,117],"UNet-based":[88],"module,":[89],"transforms":[91],"output":[93],"of":[94,133,165],"latent":[100,113],"variable":[101],"sequence":[102],"containing":[103],"essential":[104],"periodic":[105],"aperiodic":[107],"information.":[108],"addition":[110],"sequence,":[114],"also":[118],"takes":[119],"melspectrogram":[121],"input":[123],"generate":[125],"48":[126],"high-fidelity":[128],"singing":[129],"voices.":[130],"terms":[132],"discriminators,":[134],"combine":[136],"multi-period":[138],"discriminator,":[139],"originally":[141],"proposed":[142],"in":[143],"HiFiGAN,":[144],"multi-resolution":[147],"multiband":[148],"STFT":[149],"discriminator.":[150],"Notably,":[151],"InstructSing":[152],"achieves":[153],"comparable":[154],"voice":[155],"quality":[156],"but":[161],"only":[163],"one-tenth":[164],"steps":[168],"on":[169],"4":[171],"NVIDIA":[172],"V100":[173],"GPU":[174],"machine<sup":[175],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[176,192],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>.":[177],"We":[178],"plan":[179],"open-source":[181],"our":[182],"code":[183],"pretrained":[185],"model":[186],"once":[187],"paper":[189],"get":[190],"accepted.<sup":[191],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>Demo":[193],"page:":[194],"https://wavelandspeech.github.io/instructsing/":[195]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
