{"id":"https://openalex.org/W7117458386","doi":"https://doi.org/10.1145/3714394.3756154","title":"MagiaSVS: Singing Voice Synthesis with Lyrics and Pitch Guidance via a Unified-Modal Large Language Model","display_name":"MagiaSVS: Singing Voice Synthesis with Lyrics and Pitch Guidance via a Unified-Modal Large Language Model","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W7117458386","doi":"https://doi.org/10.1145/3714394.3756154"},"language":null,"primary_location":{"id":"doi:10.1145/3714394.3756154","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3714394.3756154","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2025 ACM International Joint Conference on Pervasive and Ubiquitous Computing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101501482","display_name":"Hao Zhou","orcid":"https://orcid.org/0000-0002-1051-1862"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hao Zhou","raw_affiliation_strings":["Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Nankai University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121461504","display_name":"Zhiyue Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhiyue Wu","raw_affiliation_strings":["Independent, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Independent, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121489505","display_name":"Xingjian Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xingjian Du","raw_affiliation_strings":["Independent, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Independent, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100664366","display_name":"Haining Zhang","orcid":"https://orcid.org/0000-0002-4367-4369"},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haining Zhang","raw_affiliation_strings":["Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Nankai University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5121449811","display_name":"Binhui Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I205237279","display_name":"Nankai University","ror":"https://ror.org/01y1kjr75","country_code":"CN","type":"education","lineage":["https://openalex.org/I205237279"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Binhui Wang","raw_affiliation_strings":["College of Software, Nankai University, Tianjin, China and Innovation and Intelligent Design Center(I\u00b2DC), Nankai University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"College of Software, Nankai University, Tianjin, China and Innovation and Intelligent Design Center(I\u00b2DC), Nankai University, Tianjin, China","institution_ids":["https://openalex.org/I205237279"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101501482"],"corresponding_institution_ids":["https://openalex.org/I205237279"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.66674179,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"656","last_page":"661"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.19419999420642853,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.19419999420642853,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.15620000660419464,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.15379999577999115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/lyrics","display_name":"Lyrics","score":0.7026000022888184},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6074000000953674},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.5898000001907349},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.47679999470710754},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4542999863624573},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.45399999618530273},{"id":"https://openalex.org/keywords/wearable-computer","display_name":"Wearable computer","score":0.43470001220703125},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.3993000090122223}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7868000268936157},{"id":"https://openalex.org/C2776436406","wikidata":"https://www.wikidata.org/wiki/Q602446","display_name":"Lyrics","level":2,"score":0.7026000022888184},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6074000000953674},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.5898000001907349},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.47679999470710754},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4706999957561493},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4542999863624573},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.45399999618530273},{"id":"https://openalex.org/C150594956","wikidata":"https://www.wikidata.org/wiki/Q1334829","display_name":"Wearable computer","level":2,"score":0.43470001220703125},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3993000090122223},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.39640000462532043},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.37869998812675476},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C54290928","wikidata":"https://www.wikidata.org/wiki/Q4845080","display_name":"Wearable technology","level":3,"score":0.2838999927043915},{"id":"https://openalex.org/C504749915","wikidata":"https://www.wikidata.org/wiki/Q9010971","display_name":"Speech technology","level":3,"score":0.2793999910354614},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2671999931335449},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.25189998745918274},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3714394.3756154","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3714394.3756154","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2025 ACM International Joint Conference on Pervasive and Ubiquitous Computing","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.8223942518234253}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W3215615641"],"related_works":[],"abstract_inverted_index":{"The":[0],"rise":[1],"of":[2,17,37,164,180],"large":[3],"language":[4,55],"models":[5],"has":[6],"created":[7],"new":[8],"frontiers":[9],"for":[10,147],"personalized":[11],"content":[12],"creation,":[13],"a":[14,31,46,53,66,71,82,123],"key":[15,59],"aspect":[16],"ubiquitous":[18],"interaction.":[19],"In":[20],"this":[21,102],"paper,":[22],"we":[23],"explore":[24,144],"their":[25],"application":[26],"to":[27,77,109,133,138],"Singing":[28],"Voice":[29],"Synthesis,":[30],"challenging":[32],"task":[33],"requiring":[34],"the":[35,107,112,136,139,145,161,174,178],"alignment":[36],"linguistic,":[38],"melodic,":[39],"and":[40,70,94,99,118,130,152,176],"acoustic":[41,156],"information.":[42],"We":[43,121,142],"introduce":[44],"MagiaSVS,":[45],"novel":[47],"framework":[48],"that":[49,64],"re-frames":[50],"SVS":[51,140],"as":[52],"unified":[54,103],"modeling":[56],"problem.":[57],"Our":[58],"contribution":[60],"is":[61],"an":[62],"architecture":[63],"leverages":[65],"pre-trained":[67],"LLM":[68,108,137],"(Qwen3)":[69],"hierarchical":[72],"neural":[73],"audio":[74,96,166],"codec":[75],"(DAC)":[76],"model":[78],"multimodal":[79],"inputs":[80],"within":[81],"single,":[83],"shared":[84],"embedding":[85],"space.":[86],"By":[87],"projecting":[88],"lyrics":[89],"text":[90],"tokens,":[91,93],"pitch":[92],"disentangled":[95],"tokens":[97],"(semantic":[98],"acoustic)":[100],"into":[101],"space,":[104],"MagiaSVS":[105,148,181],"enables":[106],"directly":[110],"learn":[111],"intricate":[113],"relationships":[114],"between":[115],"lyrics,":[116],"melody,":[117],"vocal":[119],"timbre.":[120],"utilize":[122],"two-stage":[124],"training":[125],"strategy,":[126],"involving":[127],"modality-adaptive":[128],"pre-training":[129],"cross-modal":[131],"fine-tuning,":[132],"effectively":[134],"adapt":[135],"task.":[141],"also":[143],"potential":[146,184],"in":[149,185],"wearable":[150,186],"applications":[151],"its":[153,183],"interplay":[154],"with":[155],"sensing,":[157],"which":[158],"could":[159],"inform":[160],"future":[162],"development":[163],"interactive":[165],"experiences.":[167],"Future":[168],"work":[169],"will":[170],"focus":[171],"on":[172],"enhancing":[173],"performance":[175],"expanding":[177],"capabilities":[179],"including":[182],"applications.":[187]},"counts_by_year":[],"updated_date":"2026-02-23T20:09:44.859080","created_date":"2025-12-29T00:00:00"}
