{"id":"https://openalex.org/W2964286535","doi":"https://doi.org/10.1109/icassp.2019.8683476","title":"Generative Moment Matching Network-based Random Modulation Post-filter for DNN-based Singing Voice Synthesis and Neural Double-tracking","display_name":"Generative Moment Matching Network-based Random Modulation Post-filter for DNN-based Singing Voice Synthesis and Neural Double-tracking","publication_year":2019,"publication_date":"2019-04-16","ids":{"openalex":"https://openalex.org/W2964286535","doi":"https://doi.org/10.1109/icassp.2019.8683476","mag":"2964286535"},"language":"en","primary_location":{"id":"doi:10.1109/icassp.2019.8683476","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2019.8683476","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066687140","display_name":"Hiroki Tamaru","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Hiroki Tamaru","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083394213","display_name":"Yuki Saito","orcid":"https://orcid.org/0000-0002-7967-2613"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yuki Saito","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013050263","display_name":"Shinnosuke Takamichi","orcid":"https://orcid.org/0000-0003-0520-7847"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinnosuke Takamichi","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024181978","display_name":"Tomoki Koriyama","orcid":"https://orcid.org/0000-0002-8347-5604"},"institutions":[{"id":"https://openalex.org/I114531698","display_name":"Tokyo Institute of Technology","ror":"https://ror.org/0112mx960","country_code":"JP","type":"education","lineage":["https://openalex.org/I114531698"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tomoki Koriyama","raw_affiliation_strings":["School of Engineering, Tokyo Institute of Technology, Japan"],"affiliations":[{"raw_affiliation_string":"School of Engineering, Tokyo Institute of Technology, Japan","institution_ids":["https://openalex.org/I114531698"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003814223","display_name":"Hiroshi Saruwatari","orcid":"https://orcid.org/0000-0003-0876-5617"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroshi Saruwatari","raw_affiliation_strings":["Graduate School of Information Science and Technology, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5066687140"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":1.161,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.78906346,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"12","issue":null,"first_page":"7070","last_page":"7074"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.7167954444885254},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7140097618103027},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6989440321922302},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.596237301826477},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.5568251013755798},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5010175704956055},{"id":"https://openalex.org/keywords/pitch-contour","display_name":"Pitch contour","score":0.4592691659927368},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.4465644955635071},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4390139579772949},{"id":"https://openalex.org/keywords/fundamental-frequency","display_name":"Fundamental frequency","score":0.4161880314350128},{"id":"https://openalex.org/keywords/moment","display_name":"Moment (physics)","score":0.4157376289367676},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3607054352760315},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.26210710406303406}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.7167954444885254},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7140097618103027},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6989440321922302},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.596237301826477},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.5568251013755798},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5010175704956055},{"id":"https://openalex.org/C2777895490","wikidata":"https://www.wikidata.org/wiki/Q7198848","display_name":"Pitch contour","level":2,"score":0.4592691659927368},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.4465644955635071},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4390139579772949},{"id":"https://openalex.org/C10513763","wikidata":"https://www.wikidata.org/wiki/Q1331774","display_name":"Fundamental frequency","level":2,"score":0.4161880314350128},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.4157376289367676},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3607054352760315},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.26210710406303406},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C44870925","wikidata":"https://www.wikidata.org/wiki/Q37547","display_name":"Astrophysics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp.2019.8683476","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2019.8683476","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2019 - 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5799999833106995}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W29794711","https://openalex.org/W620379668","https://openalex.org/W1487641199","https://openalex.org/W1523372075","https://openalex.org/W1595391793","https://openalex.org/W1959608418","https://openalex.org/W2029434926","https://openalex.org/W2099471712","https://openalex.org/W2124097505","https://openalex.org/W2144902422","https://openalex.org/W2146502635","https://openalex.org/W2154920538","https://openalex.org/W2293049663","https://openalex.org/W2395578248","https://openalex.org/W2408435475","https://openalex.org/W2428952132","https://openalex.org/W2471520273","https://openalex.org/W2515336442","https://openalex.org/W2516321201","https://openalex.org/W2567070169","https://openalex.org/W2607404225","https://openalex.org/W2746654391","https://openalex.org/W2778460379","https://openalex.org/W2804998325","https://openalex.org/W2950292946","https://openalex.org/W2962916039","https://openalex.org/W2963398683","https://openalex.org/W2963970792","https://openalex.org/W4247639588","https://openalex.org/W4320013936","https://openalex.org/W6601203246","https://openalex.org/W6629354409","https://openalex.org/W6631309588","https://openalex.org/W6640963894","https://openalex.org/W6678318511","https://openalex.org/W6681302627","https://openalex.org/W6681435938","https://openalex.org/W6711777497","https://openalex.org/W6713876852","https://openalex.org/W6718237598","https://openalex.org/W6731370813"],"related_works":["https://openalex.org/W2064065729","https://openalex.org/W2616987818","https://openalex.org/W2108382268","https://openalex.org/W1890221585","https://openalex.org/W169038075","https://openalex.org/W2508035242","https://openalex.org/W2140156791","https://openalex.org/W91866709","https://openalex.org/W3093375612","https://openalex.org/W2127017784"],"abstract_inverted_index":{"This":[0],"paper":[1],"proposes":[2],"a":[3,28,34,43,60,96,116],"generative":[4],"moment":[5],"matching":[6],"network":[7,18],"(GMMN)-based":[8],"post-filter":[9],"that":[10,134,158],"provides":[11],"inter-utterance":[12,118,141],"pitch":[13,25,108,122,142],"variation":[14,26,101,119,143],"for":[15],"deep":[16],"neural":[17,161],"(DNN)-based":[19],"singing":[20,30,65,112,128],"voice":[21,31,129],"synthesis.":[22,130],"The":[23],"natural":[24,111,167],"of":[27,50,102,106,110],"human":[29],"leads":[32],"to":[33,58,98,120,152,166],"richer":[35],"musical":[36,88],"experience":[37],"and":[38,56,80,114,154],"is":[39,78,84,163],"used":[40],"in":[41,46],"double-tracking,":[42,153],"recording":[44],"method":[45],"which":[47],"two":[48],"performances":[49],"the":[51,75,100,103,107,121,155],"same":[52],"phrase":[53],"are":[54],"recorded":[55],"mixed":[57],"create":[59],"richer,":[61],"layered":[62],"sound.":[63],"However,":[64],"voices":[66,113],"synthesized":[67,85],"using":[68],"conventional":[69,126,170],"DNN-based":[70,127],"methods":[71],"never":[72],"vary":[73],"because":[74],"synthesis":[76],"process":[77],"deterministic":[79],"only":[81],"one":[82,87],"waveform":[83],"from":[86],"score.":[89],"To":[90],"address":[91],"this":[92],"problem,":[93],"we":[94],"use":[95],"GMMN":[97],"model":[99],"modulation":[104],"spectrum":[105],"contour":[109,123],"add":[115],"randomized":[117],"generated":[124],"by":[125],"Experimental":[131],"evaluations":[132],"suggest":[133],"1)":[135],"our":[136,150],"approach":[137,151],"can":[138],"provide":[139],"perceptible":[140],"while":[144],"preserving":[145],"speech":[146],"quality.":[147],"We":[148],"extend":[149],"evaluation":[156],"demonstrates":[157],"2)":[159],"GMMN-based":[160],"double-tracking":[162,168,174],"perceptually":[164],"closer":[165],"than":[169],"signal":[171],"processing-based":[172],"artificial":[173],"is.":[175]},"counts_by_year":[{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
