{"id":"https://openalex.org/W4415537140","doi":"https://doi.org/10.1145/3746027.3755736","title":"HarmoniVox: Painting Voices to Match the Avatar's Soul","display_name":"HarmoniVox: Painting Voices to Match the Avatar's Soul","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537140","doi":"https://doi.org/10.1145/3746027.3755736"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755736","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755736","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3755736","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026277472","display_name":"Songtao Zhou","orcid":"https://orcid.org/0009-0008-5972-3955"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Songtao Zhou","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018252335","display_name":"Xiaoyu Qin","orcid":"https://orcid.org/0000-0002-9720-3220"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyu Qin","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100749226","display_name":"Yixuan Zhou","orcid":"https://orcid.org/0009-0002-6363-891X"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yixuan Zhou","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101918812","display_name":"Qixin Wang","orcid":"https://orcid.org/0009-0009-5832-8192"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qixin Wang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033137691","display_name":"Zeyu Jin","orcid":"https://orcid.org/0000-0001-8465-8878"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zeyu Jin","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103243439","display_name":"Zixuan Wang","orcid":"https://orcid.org/0000-0001-7291-6198"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zixuan Wang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102869280","display_name":"Zhiyong Wu","orcid":"https://orcid.org/0000-0001-8533-0524"},"institutions":[{"id":"https://openalex.org/I3131625388","display_name":"University Town of Shenzhen","ror":"https://ror.org/05f5j6225","country_code":"CN","type":"education","lineage":["https://openalex.org/I3131625388"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Wu","raw_affiliation_strings":["Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen International Graduate School, Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I3131625388","https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039477812","display_name":"Jia Jia","orcid":"https://orcid.org/0009-0005-8449-278X"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jia Jia","raw_affiliation_strings":["Department of Computer Science and Technology, BNRist, Tsinghua University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5026277472"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38583925,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"6720","last_page":"6729"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/avatar","display_name":"Avatar","score":0.866599977016449},{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.6294999718666077},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.599399983882904},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.4587000012397766},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.4320000112056732},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4117000102996826},{"id":"https://openalex.org/keywords/painting","display_name":"Painting","score":0.35530000925064087},{"id":"https://openalex.org/keywords/presentation","display_name":"Presentation (obstetrics)","score":0.3515999913215637}],"concepts":[{"id":"https://openalex.org/C2777365542","wikidata":"https://www.wikidata.org/wiki/Q83090","display_name":"Avatar","level":2,"score":0.866599977016449},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6565999984741211},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.6294999718666077},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.599399983882904},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.4587000012397766},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.4320000112056732},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4296000003814697},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4117000102996826},{"id":"https://openalex.org/C205783811","wikidata":"https://www.wikidata.org/wiki/Q11629","display_name":"Painting","level":2,"score":0.35530000925064087},{"id":"https://openalex.org/C2777601897","wikidata":"https://www.wikidata.org/wiki/Q3409113","display_name":"Presentation (obstetrics)","level":2,"score":0.3515999913215637},{"id":"https://openalex.org/C2780583480","wikidata":"https://www.wikidata.org/wiki/Q1366327","display_name":"Tone (literature)","level":2,"score":0.34950000047683716},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3474000096321106},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34360000491142273},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3361999988555908},{"id":"https://openalex.org/C2776453491","wikidata":"https://www.wikidata.org/wiki/Q5659234","display_name":"Harmony (color)","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.28459998965263367},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.28209999203681946},{"id":"https://openalex.org/C138591656","wikidata":"https://www.wikidata.org/wiki/Q5157538","display_name":"Computer facial animation","level":4,"score":0.2696000039577484},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.2689000070095062},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.2574000060558319},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.25609999895095825},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.25529998540878296}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755736","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755736","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746027.3755736","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3755736","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G342889659","display_name":null,"funder_award_id":"No. 2024QY1400","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G767264109","display_name":null,"funder_award_id":"No. 62425604","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1967554269","https://openalex.org/W1997060370","https://openalex.org/W2026133922","https://openalex.org/W2077881910","https://openalex.org/W2105445261","https://openalex.org/W2119131115","https://openalex.org/W2199250669","https://openalex.org/W2237250383","https://openalex.org/W2769666294","https://openalex.org/W2808631503","https://openalex.org/W2952370363","https://openalex.org/W3021656379","https://openalex.org/W3034192160","https://openalex.org/W3084572050","https://openalex.org/W3197199219","https://openalex.org/W3200832617","https://openalex.org/W3203514471","https://openalex.org/W4249238272","https://openalex.org/W4304080617","https://openalex.org/W4385823163","https://openalex.org/W4385825380","https://openalex.org/W4385825592","https://openalex.org/W4386072021","https://openalex.org/W4386361734","https://openalex.org/W4387969119","https://openalex.org/W4389302490","https://openalex.org/W4393147046","https://openalex.org/W4402703639","https://openalex.org/W4413147005"],"related_works":[],"abstract_inverted_index":{"Imagine":[0],"James":[1],"Bond":[2],"speaking":[3],"like":[4],"Mr.":[5],"Bean---such":[6],"a":[7,11,83,114,118,128,137,157],"mismatch":[8],"would":[9,197],"create":[10],"jarring":[12],"dissonance":[13],"and":[14,41,46,61,78,151,180,190],"break":[15],"the":[16,36,42,63,102,169,173,177,186],"viewer's":[17],"immersion.":[18],"Current":[19],"research":[20],"on":[21,27,96,193],"virtual":[22],"avatar":[23,91,153,187],"animation":[24],"has":[25],"focused":[26],"modeling":[28],"3D":[29],"geometry,":[30],"appearance,":[31,73],"motion":[32],"generation,":[33],"however,":[34],"neglecting":[35],"harmony":[37],"between":[38],"speech":[39,99,149,184],"prosody":[40,150],"avatar's":[43,103],"visual":[44,107,154,188],"presentation":[45],"contextual":[47],"environment.":[48],"In":[49],"this":[50,56,133,163],"paper,":[51],"we":[52,81,112,135],"seek":[53],"to":[54,146],"bridge":[55],"gap":[57],"by":[58],"firstly":[59],"identifying":[60],"defining":[62],"key":[64],"elements":[65],"necessary":[66],"for":[67],"achieving":[68],"audiovisual":[69],"harmony,":[70],"such":[71],"as":[72],"expression,":[74],"body":[75],"posture,":[76],"backgrounds":[77],"colors.":[79],"Subsequently,":[80],"propose":[82],"method":[84,171],"that":[85,168],"jointly":[86],"models":[87],"semantic":[88],"consistency":[89],"in":[90,127,175,200],"animation,":[92],"named":[93],"HarmoniVox,":[94],"specifically":[95],"crafting":[97],"prosodic":[98],"consistent":[100],"with":[101,117,185],"essence":[104],"from":[105],"given":[106],"image.":[108],"To":[109,131],"achieve":[110],"this,":[111],"implement":[113],"technical":[115],"framework":[116],"mutual":[119],"modal":[120],"contrastive":[121],"learning":[122],"strategy,":[123],"enhancing":[124],"multimodal":[125],"alignment":[126],"coarse-to-fine":[129],"fashion.":[130],"support":[132],"method,":[134],"establish":[136],"experimental":[138],"dataset":[139],"HarAvaSpeech":[140],"comprising":[141],"28,929":[142],"image-audio":[143],"pairs,":[144],"designed":[145],"encompass":[147],"expressive":[148],"rich":[152],"presentations":[155],"across":[156],"wide":[158],"range":[159],"of":[160,183],"contexts.":[161],"Leveraging":[162],"dataset,":[164],"our":[165],"experiments":[166],"demonstrate":[167],"proposed":[170],"outperforms":[172],"baselines":[174],"manipulating":[176],"nuanced":[178],"tone":[179],"harmonious":[181],"rhythm":[182],"presentations,":[189],"reveal":[191],"generalizability":[192],"out-of-domain":[194],"cases.":[195],"Demo":[196],"be":[198],"provided":[199],"https://harmonivox.github.io/harmonivox/.":[201]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
