{"id":"https://openalex.org/W4405709542","doi":"https://doi.org/10.1109/iscslp63861.2024.10800419","title":"COMOSVC: Consistency Model-Based Singing Voice Conversion","display_name":"COMOSVC: Consistency Model-Based Singing Voice Conversion","publication_year":2024,"publication_date":"2024-11-07","ids":{"openalex":"https://openalex.org/W4405709542","doi":"https://doi.org/10.1109/iscslp63861.2024.10800419"},"language":"en","primary_location":{"id":"doi:10.1109/iscslp63861.2024.10800419","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp63861.2024.10800419","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113190721","display_name":"Yiwen Lu","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Yiwen Lu","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019978970","display_name":"Zhen Ye","orcid":"https://orcid.org/0009-0003-6932-9859"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Zhen Ye","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100652844","display_name":"Wei Xue","orcid":"https://orcid.org/0000-0002-4942-7748"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Wei Xue","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101522530","display_name":"Xu Tan","orcid":"https://orcid.org/0000-0001-5631-0639"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu Tan","raw_affiliation_strings":["Microsoft Research Asia"],"affiliations":[{"raw_affiliation_string":"Microsoft Research Asia","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061082180","display_name":"Qifeng Liu","orcid":"https://orcid.org/0000-0001-6191-076X"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Qifeng Liu","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045081171","display_name":"Yike Guo","orcid":"https://orcid.org/0000-0002-3075-2161"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yike Guo","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5113190721"],"corresponding_institution_ids":["https://openalex.org/I200769079"],"apc_list":null,"apc_paid":null,"fwci":2.0119,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.89085401,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"184","last_page":"188"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9559999704360962,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.7696410417556763},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6982657313346863},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.655450165271759},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5132253170013428},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.20338314771652222},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.15421846508979797}],"concepts":[{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.7696410417556763},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6982657313346863},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.655450165271759},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5132253170013428},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.20338314771652222},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.15421846508979797},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/iscslp63861.2024.10800419","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iscslp63861.2024.10800419","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 14th International Symposium on Chinese Spoken Language Processing (ISCSLP)","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-148794","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-148794","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W2294038178","https://openalex.org/W2401296648","https://openalex.org/W2963103134","https://openalex.org/W2972812066","https://openalex.org/W3095948607","https://openalex.org/W3170751106","https://openalex.org/W4226320669","https://openalex.org/W4285345683","https://openalex.org/W4387968164","https://openalex.org/W6633684334","https://openalex.org/W6786375611","https://openalex.org/W6838452192","https://openalex.org/W6839738141","https://openalex.org/W6847363464","https://openalex.org/W6850614898","https://openalex.org/W6857514821","https://openalex.org/W7034042604"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390529913","https://openalex.org/W2142368101","https://openalex.org/W2372249404","https://openalex.org/W2367547137","https://openalex.org/W2354994102","https://openalex.org/W2387733758","https://openalex.org/W2376664795"],"abstract_inverted_index":{"The":[0],"diffusion-based":[1,58,102],"Singing":[2],"Voice":[3],"Conversion":[4],"(SVC)":[5],"methods":[6],"have":[7],"achieved":[8],"remarkable":[9],"performances,":[10],"producing":[11],"natural":[12],"audios":[13],"with":[14],"high":[15],"similarity":[16],"to":[17,49,77],"the":[18,22,99],"target":[19],"timbre.":[20],"However,":[21],"iterative":[23],"sampling":[24],"process":[25],"results":[26],"in":[27],"slow":[28],"inference":[29,96],"speed,":[30],"and":[31,54,67,117],"acceleration":[32],"thus":[33],"becomes":[34],"crucial.":[35],"In":[36],"this":[37],"paper,":[38],"we":[39],"propose":[40],"CoMoSVC,":[41],"a":[42,68,83,93],"consistency":[43],"model-based":[44],"SVC":[45,103],"method,":[46],"which":[47],"aims":[48],"achieve":[50,78],"both":[51,115],"high-quality":[52],"generation":[53],"high-speed":[55],"sampling.":[56,80],"A":[57],"teacher":[59],"model":[60,70],"is":[61,71],"first":[62],"specially":[63],"designed":[64],"for":[65],"SVC,":[66],"student":[69],"further":[72],"distilled":[73],"under":[74],"self-consistency":[75],"properties":[76],"one-step":[79],"Experiments":[81],"on":[82,114],"single":[84],"NVIDIA":[85],"RTX4090":[86],"GPU":[87],"reveal":[88],"that":[89],"although":[90],"CoMoSVC":[91],"has":[92],"significantly":[94],"faster":[95],"speed":[97],"than":[98],"state-of-the-art":[100],"(SOTA)":[101],"systems,":[104],"it":[105],"still":[106],"achieves":[107],"comparable":[108],"or":[109],"superior":[110],"conversion":[111],"performance":[112],"based":[113],"subjective":[116],"objective":[118],"metrics.":[119],"Audio":[120],"samples":[121],"are":[122],"available":[123],"at":[124],"https://comosvc.github.io/.":[125]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-19T08:26:33.389920","created_date":"2025-10-10T00:00:00"}
