{"id":"https://openalex.org/W4415536841","doi":"https://doi.org/10.1145/3746027.3755502","title":"UniTalker: Conversational Speech-Visual Synthesis","display_name":"UniTalker: Conversational Speech-Visual Synthesis","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415536841","doi":"https://doi.org/10.1145/3746027.3755502"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755502","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755502","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048157403","display_name":"Yifan Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yifan Hu","raw_affiliation_strings":["Inner Mongolia University, Hohhot, China"],"affiliations":[{"raw_affiliation_string":"Inner Mongolia University, Hohhot, China","institution_ids":["https://openalex.org/I2722730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082746577","display_name":"Rui Liu","orcid":"https://orcid.org/0000-0003-4524-7413"},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Liu","raw_affiliation_strings":["Inner Mongolia University, Hohhot, China"],"affiliations":[{"raw_affiliation_string":"Inner Mongolia University, Hohhot, China","institution_ids":["https://openalex.org/I2722730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108582272","display_name":"Yi Ren","orcid":"https://orcid.org/0000-0002-9160-3848"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi Ren","raw_affiliation_strings":["ByteDance, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"ByteDance, Singapore, Singapore","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102916605","display_name":"Xiang Yin","orcid":"https://orcid.org/0000-0003-0472-2783"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiang Yin","raw_affiliation_strings":["ByteDance, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ByteDance, Shanghai, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032690182","display_name":"Haizhou Li","orcid":"https://orcid.org/0000-0001-9158-9401"},"institutions":[{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haizhou Li","raw_affiliation_strings":["SRIBD, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"SRIBD, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924","https://openalex.org/I4210099586"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5048157403"],"corresponding_institution_ids":["https://openalex.org/I2722730"],"apc_list":null,"apc_paid":null,"fwci":11.5151,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.98146675,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"10248","last_page":"10257"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9843999743461609,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9843999743461609,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9782000184059143,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12032","display_name":"Multisensory perception and integration","score":0.975600004196167,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.583899974822998},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5030999779701233},{"id":"https://openalex.org/keywords/facial-expression","display_name":"Facial expression","score":0.49790000915527344},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4237000048160553},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.39980000257492065},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.39329999685287476},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.38449999690055847},{"id":"https://openalex.org/keywords/natural","display_name":"Natural (archaeology)","score":0.3790000081062317},{"id":"https://openalex.org/keywords/natural-language-understanding","display_name":"Natural language understanding","score":0.3531999886035919}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8047000169754028},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.583899974822998},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5030999779701233},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.49790000915527344},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.42500001192092896},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4237000048160553},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.39980000257492065},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.39329999685287476},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.387800008058548},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.38449999690055847},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38109999895095825},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.3790000081062317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3774999976158142},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3531999886035919},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C2780297707","wikidata":"https://www.wikidata.org/wiki/Q4895393","display_name":"Landmark","level":2,"score":0.325300008058548},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.3237000107765198},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3061000108718872},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2946999967098236},{"id":"https://openalex.org/C164850336","wikidata":"https://www.wikidata.org/wiki/Q3685487","display_name":"Interpersonal communication","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.2535000145435333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755502","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755502","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2161907179","https://openalex.org/W2749468224","https://openalex.org/W2803193013","https://openalex.org/W2962785568","https://openalex.org/W2970641574","https://openalex.org/W3003243900","https://openalex.org/W3081492798","https://openalex.org/W3099284785","https://openalex.org/W3151309757","https://openalex.org/W3180355996","https://openalex.org/W4304013787","https://openalex.org/W4307680525","https://openalex.org/W4312722235","https://openalex.org/W4372260474","https://openalex.org/W4375868859","https://openalex.org/W4385572830","https://openalex.org/W4386072021","https://openalex.org/W4387186497","https://openalex.org/W4387968649","https://openalex.org/W4389524500","https://openalex.org/W4392931276","https://openalex.org/W4392931281","https://openalex.org/W4393160807","https://openalex.org/W4399790320","https://openalex.org/W4402152357","https://openalex.org/W4402671204","https://openalex.org/W4402683976","https://openalex.org/W4403780722","https://openalex.org/W4404781824"],"related_works":[],"abstract_inverted_index":{"Conversational":[0,76],"Speech":[1],"Synthesis":[2,78],"(CSS)":[3],"is":[4,25,45,110],"a":[5,75,104,111,126,193,207],"key":[6,189],"task":[7,80],"in":[8,35,135,179],"the":[9,54,66,136,144,157,173,219],"user-agent":[10],"interaction":[11],"area,":[12],"aiming":[13],"to":[14,47,130,154,198],"generate":[15,163],"more":[16,232,239],"expressive":[17],"and":[18,29,51,119,143,161,166,184,200,224,235,241,249],"empathetic":[19,164,233],"speech":[20,52,165,234],"for":[21],"users.":[22],"However,":[23],"it":[24,92,124,149],"well-known":[26],"that":[27,114,172,228],"''listening''":[28],"''eye":[30],"contact''":[31],"play":[32],"crucial":[33],"roles":[34],"conveying":[36],"emotions":[37],"during":[38,218],"real-world":[39],"interpersonal":[40],"communication.":[41],"Existing":[42],"CSS":[43],"research":[44],"limited":[46],"perceiving":[48],"only":[49],"text":[50],"within":[53],"dialogue":[55,90,137],"context,":[56,91,138],"which":[57,109],"restricts":[58],"its":[59],"effectiveness.":[60],"Moreover,":[61],"speech-only":[62],"responses":[63],"further":[64],"constrain":[65],"interactive":[67],"experience.":[68],"To":[69,99,170],"address":[70],"these":[71],"limitations,":[72],"we":[73,102,186],"introduce":[74,187],"Speech-Visual":[77],"(CSVS)":[79],"as":[81],"an":[82],"extension":[83],"of":[84,181],"traditional":[85],"CSS.":[86],"By":[87],"leveraging":[88],"multimodal":[89,117,120,133],"provides":[93,236],"users":[94,237],"with":[95,238],"coherent":[96],"audiovisual":[97],"responses.":[98],"this":[100],"end,":[101],"develop":[103],"CSVS":[105],"system":[106],"named":[107],"UniTalker,":[108],"unified":[112],"model":[113,129,230],"seamlessly":[115],"integrates":[116],"perception":[118],"rendering":[121,217],"capabilities.":[122],"Specifically,":[123],"leverages":[125],"large-scale":[127],"language":[128],"comprehensively":[131],"understand":[132],"cues":[134],"including":[139],"speaker,":[140],"text,":[141],"speech,":[142],"talking-face":[145,168,244],"animations.":[146,169,245],"After":[147],"that,":[148],"employs":[150],"multi-task":[151],"sequence":[152],"prediction":[153],"first":[155],"infer":[156],"target":[158],"utterance's":[159],"emotion":[160],"then":[162],"natural":[167,240],"ensure":[171],"generated":[174,250],"speech-visual":[175,209],"content":[176],"remains":[177],"consistent":[178,243],"terms":[180],"emotion,":[182],"content,":[183],"duration,":[185],"three":[188],"optimizations:":[190],"1)":[191],"Designing":[192],"specialized":[194],"neural":[195],"landmark":[196],"codec":[197],"tokenize":[199],"reconstruct":[201],"facial":[202],"expression":[203],"sequences.":[204],"2)":[205],"Proposing":[206],"bimodal":[208],"hard":[210],"alignment":[211],"decoding":[212],"strategy.":[213],"3)":[214],"Applying":[215],"emotion-guided":[216],"generation":[220],"stage.":[221],"Comprehensive":[222],"objective":[223],"subjective":[225],"experiments":[226],"demonstrate":[227],"our":[229],"synthesizes":[231],"emotionally":[242],"The":[246],"source":[247],"code":[248],"samples":[251],"are":[252],"available":[253],"at:":[254],"https://github.com/AI-S2-Lab/UniTalker.":[255]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
