{"id":"https://openalex.org/W4372346571","doi":"https://doi.org/10.1109/icassp49357.2023.10095084","title":"Visual-Aware Text-to-Speech<sup>*</sup>","display_name":"Visual-Aware Text-to-Speech<sup>*</sup>","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372346571","doi":"https://doi.org/10.1109/icassp49357.2023.10095084"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095084","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095084","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2306.12020","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006824435","display_name":"Mohan Zhou","orcid":"https://orcid.org/0000-0003-3250-4978"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Mohan Zhou","raw_affiliation_strings":["Harbin Institute of Technology,Harbin,China","Harbin Institute of Technology, Harbin, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology,Harbin,China","institution_ids":["https://openalex.org/I204983213"]},{"raw_affiliation_string":"Harbin Institute of Technology, Harbin, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054533256","display_name":"Yalong Bai","orcid":"https://orcid.org/0000-0002-8416-9027"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yalong Bai","raw_affiliation_strings":["JD Explore Academy,Beijing,China","JD Explore Academy, Beijing, China"],"affiliations":[{"raw_affiliation_string":"JD Explore Academy,Beijing,China","institution_ids":["https://openalex.org/I4210103986"]},{"raw_affiliation_string":"JD Explore Academy, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100441569","display_name":"Wei Zhang","orcid":"https://orcid.org/0000-0002-1492-8286"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Zhang","raw_affiliation_strings":["JD Explore Academy,Beijing,China","JD Explore Academy, Beijing, China"],"affiliations":[{"raw_affiliation_string":"JD Explore Academy,Beijing,China","institution_ids":["https://openalex.org/I4210103986"]},{"raw_affiliation_string":"JD Explore Academy, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088760097","display_name":"Ting Yao","orcid":"https://orcid.org/0000-0001-7587-101X"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ting Yao","raw_affiliation_strings":["JD Explore Academy,Beijing,China","JD Explore Academy, Beijing, China"],"affiliations":[{"raw_affiliation_string":"JD Explore Academy,Beijing,China","institution_ids":["https://openalex.org/I4210103986"]},{"raw_affiliation_string":"JD Explore Academy, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101661008","display_name":"Tiejun Zhao","orcid":"https://orcid.org/0000-0003-4659-4935"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tiejun Zhao","raw_affiliation_strings":["Harbin Institute of Technology,Harbin,China","Harbin Institute of Technology, Harbin, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology,Harbin,China","institution_ids":["https://openalex.org/I204983213"]},{"raw_affiliation_string":"Harbin Institute of Technology, Harbin, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017597537","display_name":"Tao Mei","orcid":"https://orcid.org/0000-0003-2497-7732"},"institutions":[{"id":"https://openalex.org/I4210103986","display_name":"Jingdong (China)","ror":"https://ror.org/01dkjkq64","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210103986"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tao Mei","raw_affiliation_strings":["JD Explore Academy,Beijing,China","JD Explore Academy, Beijing, China"],"affiliations":[{"raw_affiliation_string":"JD Explore Academy,Beijing,China","institution_ids":["https://openalex.org/I4210103986"]},{"raw_affiliation_string":"JD Explore Academy, Beijing, China","institution_ids":["https://openalex.org/I4210103986"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5006824435"],"corresponding_institution_ids":["https://openalex.org/I204983213"],"apc_list":null,"apc_paid":null,"fwci":0.4002,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.54081506,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6394192576408386},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.45386481285095215},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.42448702454566956},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.34682512283325195}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6394192576408386},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.45386481285095215},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42448702454566956},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34682512283325195}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095084","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095084","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2306.12020","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.12020","pdf_url":"https://arxiv.org/pdf/2306.12020","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2306.12020","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2306.12020","pdf_url":"https://arxiv.org/pdf/2306.12020","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5899999737739563,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4372346571.pdf"},"referenced_works_count":25,"referenced_works":["https://openalex.org/W1931802725","https://openalex.org/W2009674825","https://openalex.org/W2017107803","https://openalex.org/W2105649179","https://openalex.org/W2107037917","https://openalex.org/W2107740512","https://openalex.org/W2107860279","https://openalex.org/W2130086727","https://openalex.org/W2191779130","https://openalex.org/W2237250383","https://openalex.org/W2330486232","https://openalex.org/W2747874407","https://openalex.org/W2903739847","https://openalex.org/W2963609956","https://openalex.org/W2964449965","https://openalex.org/W2998572311","https://openalex.org/W3031414376","https://openalex.org/W3033411150","https://openalex.org/W4385490328","https://openalex.org/W6654799313","https://openalex.org/W6736996214","https://openalex.org/W6770108805","https://openalex.org/W6772349387","https://openalex.org/W6778823374","https://openalex.org/W6917585676"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W4402327032","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Dynamically":[0],"synthesizing":[1],"talking":[2],"speech":[3,50,99],"that":[4],"actively":[5],"responds":[6],"to":[7,29,48,89],"a":[8,42,86],"listening":[9],"head":[10],"is":[11],"critical":[12],"during":[13],"the":[14,19,25,31,64,75],"face-to-face":[15,67],"interaction.":[16],"For":[17],"example,":[18],"speaker":[20],"could":[21],"take":[22],"advantage":[23],"of":[24,63,77],"listener\u2019s":[26],"facial":[27],"expression":[28],"adjust":[30],"tones,":[32],"stressed":[33],"syllables,":[34],"or":[35],"pauses.":[36],"In":[37],"this":[38,81],"work,":[39],"we":[40,84],"present":[41],"new":[43],"visual-aware":[44],"text-to-speech":[45],"(VA-TTS)":[46],"task":[47],"synthesize":[49],"conditioned":[51],"on":[52,103],"both":[53],"textual":[54],"inputs":[55],"and":[56,94,119],"sequential":[57],"visual":[58,78,96],"feedback":[59],"(e.g.,":[60],"nod,":[61],"smile)":[62],"listener":[65,95],"in":[66],"communication.":[68],"Different":[69],"from":[70],"traditional":[71],"text-to-speech,":[72],"VA-TTS":[73],"highlights":[74],"impact":[76],"modality.":[79],"On":[80],"newly-minted":[82],"task,":[83],"devise":[85],"baseline":[87],"model":[88],"fuse":[90],"phoneme":[91],"linguistic":[92],"information":[93],"signals":[97],"for":[98,111],"synthesis.":[100],"Extensive":[101],"experiments":[102],"multimodal":[104],"conversation":[105],"dataset":[106],"ViCo-X":[107],"verify":[108],"our":[109],"proposal":[110],"generating":[112],"more":[113],"natural":[114],"audio":[115],"with":[116],"scenario-appropriate":[117],"rhythm":[118],"prosody.":[120]},"counts_by_year":[{"year":2023,"cited_by_count":2}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2023-05-07T00:00:00"}
