{"id":"https://openalex.org/W3206191467","doi":"https://doi.org/10.1145/3503161.3547854","title":"SingGAN: Generative Adversarial Network For High-Fidelity Singing Voice Generation","display_name":"SingGAN: Generative Adversarial Network For High-Fidelity Singing Voice Generation","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W3206191467","doi":"https://doi.org/10.1145/3503161.3547854","mag":"3206191467"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3547854","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547854","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2110.07468","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011787904","display_name":"Rongjie Huang","orcid":"https://orcid.org/0000-0002-1695-9000"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rongjie Huang","raw_affiliation_strings":["Zhejiang University, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, HangZhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012657658","display_name":"Chenye Cui","orcid":"https://orcid.org/0009-0009-9083-1628"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenye Cui","raw_affiliation_strings":["Zhejiang University, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, HangZhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060844215","display_name":"Feiyang Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"FeiYang cHEN","raw_affiliation_strings":["Huawei Cloud, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud, HangZhou, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088179161","display_name":"Yi Ren","orcid":"https://orcid.org/0000-0001-9889-5460"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Ren","raw_affiliation_strings":["Zhejiang University, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, HangZhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065126806","display_name":"Jinglin Liu","orcid":"https://orcid.org/0000-0002-9905-3887"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinglin Liu","raw_affiliation_strings":["Zhejiang University, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, HangZhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079260216","display_name":"Zhou Zhao","orcid":"https://orcid.org/0000-0001-6121-0384"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhou Zhao","raw_affiliation_strings":["Zhejiang University, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, HangZhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025894594","display_name":"Baoxing Huai","orcid":null},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Baoxing Huai","raw_affiliation_strings":["Huawei Cloud, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud, HangZhou, China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065838678","display_name":"Zhefeng Wang","orcid":"https://orcid.org/0000-0001-6703-2064"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhefeng Wang","raw_affiliation_strings":["Huawei Cloud, HangZhou, China"],"affiliations":[{"raw_affiliation_string":"Huawei Cloud, HangZhou, China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5011787904"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":4.2891,"has_fulltext":false,"cited_by_count":43,"citation_normalized_percentile":{"value":0.9551068,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"2525","last_page":"2535"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.7759044170379639},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7715588808059692},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7170349359512329},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6553667783737183},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.5230023264884949},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5035542845726013},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.47200748324394226},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.44988399744033813},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43743857741355896},{"id":"https://openalex.org/keywords/vocal-tract","display_name":"Vocal tract","score":0.41120070219039917},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3318902850151062},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.14394575357437134},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.1045772135257721}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.7759044170379639},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7715588808059692},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7170349359512329},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6553667783737183},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.5230023264884949},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5035542845726013},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.47200748324394226},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.44988399744033813},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43743857741355896},{"id":"https://openalex.org/C47401133","wikidata":"https://www.wikidata.org/wiki/Q748953","display_name":"Vocal tract","level":2,"score":0.41120070219039917},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3318902850151062},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.14394575357437134},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.1045772135257721},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3503161.3547854","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547854","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2110.07468","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2110.07468","pdf_url":"https://arxiv.org/pdf/2110.07468","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2110.07468","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2110.07468","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2110.07468","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2110.07468","pdf_url":"https://arxiv.org/pdf/2110.07468","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.5600000023841858},{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.44999998807907104}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2067295501","https://openalex.org/W2107860279","https://openalex.org/W2160473997","https://openalex.org/W2519091744","https://openalex.org/W2788851830","https://openalex.org/W2903739847","https://openalex.org/W2946200149","https://openalex.org/W2949382160","https://openalex.org/W2963300588","https://openalex.org/W2964243274","https://openalex.org/W2970006822","https://openalex.org/W2970730223","https://openalex.org/W2970997853","https://openalex.org/W2971753973","https://openalex.org/W2990440871","https://openalex.org/W3015338123","https://openalex.org/W3015499232","https://openalex.org/W3015516707","https://openalex.org/W3019084079","https://openalex.org/W3021066808","https://openalex.org/W3026874504","https://openalex.org/W3033411150","https://openalex.org/W3045748506","https://openalex.org/W3046970875","https://openalex.org/W3048423403","https://openalex.org/W3081279708","https://openalex.org/W3082910224","https://openalex.org/W3092028330","https://openalex.org/W3097566756","https://openalex.org/W3100054454","https://openalex.org/W3101119695","https://openalex.org/W3103104054","https://openalex.org/W3123097577","https://openalex.org/W3128910262","https://openalex.org/W3129651364","https://openalex.org/W3130016944","https://openalex.org/W3133525064","https://openalex.org/W3144035034","https://openalex.org/W3158762648","https://openalex.org/W3162673269","https://openalex.org/W3167318608","https://openalex.org/W3168527213","https://openalex.org/W3169635929","https://openalex.org/W3172148458","https://openalex.org/W3205398360","https://openalex.org/W4214912006","https://openalex.org/W4224309908","https://openalex.org/W4280542470","https://openalex.org/W4281789500","https://openalex.org/W4285345683","https://openalex.org/W4287117308","https://openalex.org/W4287184558","https://openalex.org/W4287672314","https://openalex.org/W4289361892","https://openalex.org/W4294619240","https://openalex.org/W4298580827","https://openalex.org/W4303519914"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2011227383","https://openalex.org/W2088854863","https://openalex.org/W4402568167","https://openalex.org/W2020989338","https://openalex.org/W1823617068","https://openalex.org/W2147126679","https://openalex.org/W4300049944","https://openalex.org/W2115039802"],"abstract_inverted_index":{"Deep":[0],"generative":[1,63],"models":[2],"have":[3],"achieved":[4],"significant":[5],"progress":[6],"in":[7,50,79],"speech":[8],"synthesis":[9,16,46,208],"to":[10,43,74,93,116,196,215],"date,":[11],"while":[12],"high-fidelity":[13,68,155],"singing":[14,44,69,156,206,222],"voice":[15,45,70,157,207],"is":[17,149],"still":[18],"an":[19],"open":[20],"problem":[21,78],"for":[22,37,67],"its":[23],"long":[24,101],"continuous":[25,102],"pronunciation,":[26],"rich":[27],"high-frequency":[28,54,122],"parts,":[29],"and":[30,52,99,105,110,120,124,136,203],"strong":[31],"expressiveness.":[32],"Existing":[33],"neural":[34],"vocoders":[35],"designed":[36,66,153],"text-to-speech":[38],"cannot":[39],"directly":[40],"be":[41],"applied":[42],"because":[47],"they":[48],"result":[49],"glitches":[51],"poor":[53],"reconstruction.":[55],"In":[56],"this":[57],"work,":[58],"we":[59,83],"propose":[60,84],"SingGAN,":[61],"a":[62,175,184,212],"adversarial":[64],"network":[65],"synthesis.":[71],"Specifically,":[72],"1)":[73],"alleviate":[75],"the":[76,80,88,95,128,143,150,164,197,204,217],"glitch":[77],"generated":[81],"samples,":[82],"source":[85],"excitation":[86],"with":[87,167],"adaptive":[89],"feature":[90,138],"learning":[91],"filters":[92],"expand":[94],"receptive":[96],"field":[97],"patterns":[98],"stabilize":[100],"signal":[103],"generation;":[104],"2)":[106],"SingGAN":[107,131,148,162,173,193],"introduces":[108],"global":[109],"local":[111],"discriminators":[112],"at":[113],"different":[114],"scales":[115],"enrich":[117],"low-frequency":[118],"details":[119],"promote":[121],"reconstruction;":[123],"3)":[125],"To":[126,142],"improve":[127],"training":[129],"efficiency,":[130],"includes":[132],"auxiliary":[133],"spectrogram":[134],"losses":[135],"sub-band":[137],"matching":[139],"penalty":[140],"loss.":[141],"best":[144],"of":[145,161,178,200],"our":[146],"knowledge,":[147],"first":[151],"work":[152],"toward":[154],"vocoding.":[158],"Our":[159],"evaluation":[160],"demonstrates":[163],"state-of-the-art":[165],"results":[166],"higher-quality":[168],"(MOS":[169],"4.05)":[170],"samples.":[171],"Also,":[172],"enables":[174],"sample":[176],"speed":[177],"50x":[179],"faster":[180],"than":[181],"real-time":[182],"on":[183],"single":[185],"NVIDIA":[186],"2080Ti":[187],"GPU.":[188],"We":[189],"further":[190],"show":[191],"that":[192],"generalizes":[194],"well":[195],"mel-spectrogram":[198],"inversion":[199],"unseen":[201],"singers,":[202],"end-to-end":[205],"system":[209],"SingGAN-SVS":[210],"enjoys":[211],"two-stage":[213],"pipeline":[214],"transform":[216],"music":[218],"scores":[219],"into":[220],"expressive":[221],"voices.":[223]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":13},{"year":2023,"cited_by_count":17},{"year":2022,"cited_by_count":5}],"updated_date":"2026-03-27T14:29:43.386196","created_date":"2021-10-25T00:00:00"}
