{"id":"https://openalex.org/W4416251437","doi":"https://doi.org/10.1109/ijcnn64981.2025.11229144","title":"Fine-grained Voice Customization via Intensity Guided Latent Space Exploration","display_name":"Fine-grained Voice Customization via Intensity Guided Latent Space Exploration","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4416251437","doi":"https://doi.org/10.1109/ijcnn64981.2025.11229144"},"language":null,"primary_location":{"id":"doi:10.1109/ijcnn64981.2025.11229144","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11229144","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056544317","display_name":"Kwan-yeung Wong","orcid":"https://orcid.org/0000-0002-8700-5781"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Kwan-yeung Wong","raw_affiliation_strings":["The Hong Kong Polytechnic University,Department of Data Science &#x0026; Artificial Intelligence,Hong Kong,China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University,Department of Data Science &#x0026; Artificial Intelligence,Hong Kong,China","institution_ids":["https://openalex.org/I14243506"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043016512","display_name":"Fu-Lai Chung","orcid":"https://orcid.org/0000-0001-5294-8168"},"institutions":[{"id":"https://openalex.org/I14243506","display_name":"Hong Kong Polytechnic University","ror":"https://ror.org/0030zas98","country_code":"HK","type":"education","lineage":["https://openalex.org/I14243506"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Fu-lai Chung","raw_affiliation_strings":["The Hong Kong Polytechnic University,Department of Data Science &#x0026; Artificial Intelligence,Hong Kong,China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong Polytechnic University,Department of Data Science &#x0026; Artificial Intelligence,Hong Kong,China","institution_ids":["https://openalex.org/I14243506"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5056544317"],"corresponding_institution_ids":["https://openalex.org/I14243506"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.19469923,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6362000107765198,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.6362000107765198,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.07159999758005142,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.04639999940991402,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.6539000272750854},{"id":"https://openalex.org/keywords/personalization","display_name":"Personalization","score":0.6157000064849854},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5676000118255615},{"id":"https://openalex.org/keywords/controllability","display_name":"Controllability","score":0.5130000114440918},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4779999852180481},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.4108999967575073},{"id":"https://openalex.org/keywords/human-voice","display_name":"Human voice","score":0.3824000060558319},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.37310001254081726}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7541000247001648},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.6539000272750854},{"id":"https://openalex.org/C183003079","wikidata":"https://www.wikidata.org/wiki/Q1000371","display_name":"Personalization","level":2,"score":0.6157000064849854},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5676000118255615},{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.5130000114440918},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4779999852180481},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.42289999127388},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.4108999967575073},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3953000009059906},{"id":"https://openalex.org/C20766975","wikidata":"https://www.wikidata.org/wiki/Q7390","display_name":"Human voice","level":2,"score":0.3824000060558319},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.37310001254081726},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.35989999771118164},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3490000069141388},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.33959999680519104},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.31859999895095825},{"id":"https://openalex.org/C178718744","wikidata":"https://www.wikidata.org/wiki/Q2350070","display_name":"Voice command device","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C2780945871","wikidata":"https://www.wikidata.org/wiki/Q194274","display_name":"Backup","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2703000009059906},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26159998774528503},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn64981.2025.11229144","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11229144","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W1972134721","https://openalex.org/W2294130536","https://openalex.org/W2396592486","https://openalex.org/W2726515241","https://openalex.org/W2896893982","https://openalex.org/W2940544976","https://openalex.org/W2982865155","https://openalex.org/W3007833402","https://openalex.org/W3097206152","https://openalex.org/W3159367349","https://openalex.org/W3163573274","https://openalex.org/W4224929761","https://openalex.org/W4234123135","https://openalex.org/W4297841819","https://openalex.org/W4313196646","https://openalex.org/W4313316128","https://openalex.org/W4385820037","https://openalex.org/W4389886629","https://openalex.org/W4392908903","https://openalex.org/W4412945298"],"related_works":[],"abstract_inverted_index":{"The":[0],"synthesis":[1],"of":[2,11,99],"audible":[3],"dialogue":[4],"proves":[5],"valuable":[6],"in":[7,73,127],"a":[8,31,50,92],"wide":[9],"range":[10],"applications,":[12],"ranging":[13],"from":[14,71,115],"assistive":[15],"technologies":[16],"to":[17,34,139,164],"immersive":[18],"entertainment.":[19],"However,":[20],"the":[21,35,76,97,121,148,154],"need":[22,122],"for":[23,39,87,123,156],"varied":[24],"and":[25,159],"expressive":[26],"synthetic":[27,101,161],"voices":[28,102,162],"often":[29,69],"presents":[30,91],"bottleneck":[32],"due":[33],"significant":[36],"resources":[37,126],"required":[38],"professional":[40],"voice":[41,60,128,149],"recording.":[42],"While":[43],"text-to-speech":[44],"(TTS)":[45],"systems":[46],"have":[47],"emerged":[48],"as":[49],"promising":[51],"solution,":[52],"traditional":[53],"models":[54,142],"lack":[55],"fine-grained":[56],"control":[57],"over":[58,147],"specific":[59,165],"attributes.":[61,106],"Though":[62],"approaches":[63],"offering":[64],"attribute":[65,77],"manipulation":[66],"exist,":[67],"they":[68],"suffer":[70],"limitations":[72],"precisely":[74],"controlling":[75],"intensity":[78],"levels,":[79],"or":[80],"rely":[81],"on":[82],"expensive":[83],"human":[84,125],"annotation":[85],"processes":[86],"training.":[88],"This":[89,152],"paper":[90],"novel":[93],"TTS":[94,141],"framework":[95],"enabling":[96],"generation":[98,150],"new":[100],"with":[103],"controllable":[104],"high-level":[105],"Our":[107],"method":[108],"employs":[109],"latent":[110],"space":[111],"exploration,":[112],"leveraging":[113],"insights":[114],"various":[116],"speech":[117],"analysis":[118],"models,":[119],"circumventing":[120],"additional":[124],"creation.":[129],"Experimental":[130],"results":[131],"demonstrate":[132],"that":[133],"our":[134],"approach":[135],"maintains":[136],"naturalness":[137],"comparable":[138],"baseline":[140],"while":[143],"providing":[144],"enhanced":[145],"controllability":[146],"process.":[151],"paves":[153],"way":[155],"more":[157],"versatile":[158],"diverse":[160],"tailored":[163],"application":[166],"needs.":[167]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
