{"id":"https://openalex.org/W4403780871","doi":"https://doi.org/10.1145/3664647.3681465","title":"UniStyle: Unified Style Modeling for Speaking Style Captioning and Stylistic Speech Synthesis","display_name":"UniStyle: Unified Style Modeling for Speaking Style Captioning and Stylistic Speech Synthesis","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403780871","doi":"https://doi.org/10.1145/3664647.3681465"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681465","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681465","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101883211","display_name":"Xinfa Zhu","orcid":"https://orcid.org/0000-0001-9275-523X"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xinfa Zhu","raw_affiliation_strings":["Northwestern Polytechnical University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111283523","display_name":"Wenjie Tian","orcid":"https://orcid.org/0009-0000-3211-3219"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjie Tian","raw_affiliation_strings":["Northwestern Polytechnical University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021472684","display_name":"Xinsheng Wang","orcid":"https://orcid.org/0000-0003-1826-7419"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinsheng Wang","raw_affiliation_strings":["Microsoft, Beijing, China","Northwestern Polytechnical University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]},{"raw_affiliation_string":"Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101820632","display_name":"Lei He","orcid":"https://orcid.org/0009-0009-9870-8739"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei He","raw_affiliation_strings":["Microsoft, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063120829","display_name":"Yujia Xiao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yujia Xiao","raw_affiliation_strings":["Microsoft, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100442237","display_name":"Xi Wang","orcid":"https://orcid.org/0000-0002-0434-7939"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]},{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xi Wang","raw_affiliation_strings":["Microsoft, Beijing, China","Northwestern Polytechnical University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]},{"raw_affiliation_string":"Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101522530","display_name":"Xu Tan","orcid":"https://orcid.org/0000-0001-5631-0639"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu Tan","raw_affiliation_strings":["Microsoft, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100329353","display_name":"Sheng Zhao","orcid":"https://orcid.org/0000-0002-9624-5381"},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sheng Zhao","raw_affiliation_strings":["Microsoft, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100668966","display_name":"Lei Xie","orcid":"https://orcid.org/0000-0001-8234-0823"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Xie","raw_affiliation_strings":["Northwestern Polytechnical University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5101883211"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":0.4986,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.66090562,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"7513","last_page":"7522"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.8632208704948425},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7702061533927917},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7085228562355042},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5316327810287476},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4572437107563019},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.42087340354919434},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.40759336948394775},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3167286813259125},{"id":"https://openalex.org/keywords/art","display_name":"Art","score":0.08319643139839172},{"id":"https://openalex.org/keywords/literature","display_name":"Literature","score":0.04572129249572754},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.03711223602294922}],"concepts":[{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.8632208704948425},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7702061533927917},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7085228562355042},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5316327810287476},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4572437107563019},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42087340354919434},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.40759336948394775},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3167286813259125},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.08319643139839172},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.04572129249572754},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.03711223602294922},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3681465","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681465","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W398859631","https://openalex.org/W1956340063","https://openalex.org/W2560674852","https://openalex.org/W2787378487","https://openalex.org/W2995181338","https://openalex.org/W3008039831","https://openalex.org/W3160329778","https://openalex.org/W3198048667","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4210777104","https://openalex.org/W4226421465","https://openalex.org/W4242982359","https://openalex.org/W4285106979","https://openalex.org/W4372260509","https://openalex.org/W4379033883","https://openalex.org/W4385822787","https://openalex.org/W4390306058","https://openalex.org/W4391620705","https://openalex.org/W4393147046"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393","https://openalex.org/W4289422896"],"abstract_inverted_index":{"Understanding":[0],"the":[1,6,9,56,81,86,99,109,121,127],"speaking":[2,35,59,160,172],"style,":[3],"such":[4],"as":[5,49,106],"emotion":[7],"of":[8,58,69,80,101,111,129],"interlocutor's":[10],"speech,":[11],"and":[12,34,62,72,94,108,136,163,171],"responding":[13],"with":[14,104,120,167],"speech":[15,32,64,76,92,105,118,122,166],"in":[16,24,159,174],"an":[17,44],"appropriate":[18],"style":[19,36,60,74,112,161],"is":[20,51,83],"a":[21,70,73,134,175],"natural":[22],"occurrence":[23],"human":[25],"conversations.":[26],"However,":[27],"technically,":[28],"existing":[29],"research":[30],"on":[31,150],"synthesis":[33,119],"captioning":[37,61,162],"typically":[38],"proceeds":[39],"independently.":[40],"In":[41],"this":[42],"work,":[43],"innovative":[45],"framework,":[46],"referred":[47],"to":[48,53,84,125],"UniStyle,":[50],"proposed":[52],"incorporate":[54],"both":[55],"capabilities":[57],"style-controllable":[63],"synthesizing.":[65],"Specifically,":[66],"UniStyle":[67,155],"consists":[68],"UniConnector":[71,82],"prompt-based":[75],"generator.":[77,123],"The":[78],"role":[79],"bridge":[85],"gap":[87],"between":[88],"different":[89],"modalities,":[90],"namely":[91],"audio":[93],"text":[95,102,115],"descriptions.":[96],"It":[97],"enables":[98],"generation":[100],"descriptions":[103,116],"input":[107],"creation":[110],"representations":[113],"from":[114],"for":[117],"Besides,":[124],"overcome":[126],"issue":[128],"data":[130,142],"scarcity,":[131],"we":[132],"propose":[133],"two-stage":[135],"semi-supervised":[137],"training":[138],"strategy,":[139],"which":[140],"reduces":[141],"requirements":[143],"while":[144],"boosting":[145],"performance.":[146],"Extensive":[147],"experiments":[148],"conducted":[149],"open-source":[151],"corpora":[152],"demonstrate":[153],"that":[154],"achieves":[156],"state-of-the-art":[157],"performance":[158],"synthesizes":[164],"expressive":[165],"various":[168],"speaker":[169],"timbres":[170],"styles":[173],"zero-shot":[176],"manner.":[177]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
