{"id":"https://openalex.org/W4375869161","doi":"https://doi.org/10.1109/icassp49357.2023.10095796","title":"CN-CVS: A Mandarin Audio-Visual Dataset for Large Vocabulary Continuous Visual to Speech Synthesis","display_name":"CN-CVS: A Mandarin Audio-Visual Dataset for Large Vocabulary Continuous Visual to Speech Synthesis","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4375869161","doi":"https://doi.org/10.1109/icassp49357.2023.10095796"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095796","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095796","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100418547","display_name":"Chen Chen","orcid":"https://orcid.org/0000-0003-1957-6432"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chen Chen","raw_affiliation_strings":["Tsinghua University,Center for Speech and Language Technologies, BNRist,China","Department of Computer Science and Technology, Tsinghua University, China","Center for Speech and Language Technologies, BNRist, Tsinghua University, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Center for Speech and Language Technologies, BNRist,China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Center for Speech and Language Technologies, BNRist, Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100391452","display_name":"Dong Wang","orcid":"https://orcid.org/0000-0002-1964-3984"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dong Wang","raw_affiliation_strings":["Tsinghua University,Center for Speech and Language Technologies, BNRist,China","Center for Speech and Language Technologies, BNRist, Tsinghua University, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Center for Speech and Language Technologies, BNRist,China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Center for Speech and Language Technologies, BNRist, Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084318285","display_name":"Thomas Fang Zheng","orcid":"https://orcid.org/0000-0002-0249-4767"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Thomas Fang Zheng","raw_affiliation_strings":["Tsinghua University,Center for Speech and Language Technologies, BNRist,China","Center for Speech and Language Technologies, BNRist, Tsinghua University, China"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Center for Speech and Language Technologies, BNRist,China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Center for Speech and Language Technologies, BNRist, Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100418547"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":4.2697,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.95430932,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8107348680496216},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.8006830215454102},{"id":"https://openalex.org/keywords/phrase","display_name":"Phrase","score":0.7529729604721069},{"id":"https://openalex.org/keywords/mandarin-chinese","display_name":"Mandarin Chinese","score":0.708020806312561},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6713945865631104},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5657088160514832},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.564993143081665},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49890756607055664},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.4624939262866974},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4321712851524353},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4155910015106201},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.111197829246521},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.0804707407951355},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.06810390949249268}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8107348680496216},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.8006830215454102},{"id":"https://openalex.org/C2776224158","wikidata":"https://www.wikidata.org/wiki/Q187931","display_name":"Phrase","level":2,"score":0.7529729604721069},{"id":"https://openalex.org/C138954614","wikidata":"https://www.wikidata.org/wiki/Q9192","display_name":"Mandarin Chinese","level":2,"score":0.708020806312561},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6713945865631104},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5657088160514832},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.564993143081665},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49890756607055664},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.4624939262866974},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4321712851524353},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4155910015106201},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.111197829246521},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0804707407951355},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.06810390949249268},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095796","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095796","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"No poverty","score":0.4000000059604645,"id":"https://metadata.un.org/sdg/1"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W2015143272","https://openalex.org/W2029199293","https://openalex.org/W2551572271","https://openalex.org/W2594690981","https://openalex.org/W2604379605","https://openalex.org/W2886945201","https://openalex.org/W2890952074","https://openalex.org/W2897492880","https://openalex.org/W2963019222","https://openalex.org/W2972563022","https://openalex.org/W2972756321","https://openalex.org/W2999528291","https://openalex.org/W3006974783","https://openalex.org/W3035626590","https://openalex.org/W3126260881","https://openalex.org/W3157840621","https://openalex.org/W3211862173","https://openalex.org/W4200635083","https://openalex.org/W4206204999","https://openalex.org/W4210396727","https://openalex.org/W4221164771","https://openalex.org/W4286902976","https://openalex.org/W4296069328","https://openalex.org/W4297841641","https://openalex.org/W4298112588","https://openalex.org/W4312638101","https://openalex.org/W6732872814","https://openalex.org/W6753038255","https://openalex.org/W6754392867","https://openalex.org/W6802457966","https://openalex.org/W6803359641","https://openalex.org/W6804785693","https://openalex.org/W6810661184","https://openalex.org/W6847902775"],"related_works":["https://openalex.org/W2374317326","https://openalex.org/W2990005675","https://openalex.org/W1603321096","https://openalex.org/W2394766824","https://openalex.org/W2078713291","https://openalex.org/W2361574037","https://openalex.org/W2386292991","https://openalex.org/W2364440891","https://openalex.org/W2393726922","https://openalex.org/W2366752344"],"abstract_inverted_index":{"Research":[0],"on":[1,103,111],"Video":[2],"to":[3,19,56,72],"Speech":[4],"Synthesis":[5],"(VTS)":[6],"surges":[7],"recently":[8],"and":[9,30,38,90,122,133],"the":[10,39,87,123,128],"focus":[11],"is":[12,32,41,117],"gradually":[13],"shifting":[14],"from":[15,66,127,141],"small-vocabulary":[16],"short-phrase":[17],"VTS":[18,22,84,116],"large-vocabulary":[20],"continuous":[21,53,115],"(LVC-VTS).":[23],"A":[24],"large-scale":[25,51],"dataset":[26,61,89,132],"with":[27,86],"sufficient":[28],"speakers":[29],"utterances":[31,65],"a":[33,50,82,119],"prerequisite":[34],"for":[35,139],"such":[36],"research,":[37],"database":[40],"certainly":[42],"language":[43],"dependent.In":[44],"this":[45],"paper,":[46],"we":[47],"introduce":[48],"CN-CVS,":[49,112],"Mandarin":[52],"visual-speech":[54,78],"dataset,":[55],"support":[57],"LVC-VTS":[58],"research.":[59],"The":[60,131],"contains":[62],"about":[63],"200k":[64],"more":[67,73],"than":[68,74],"2500":[69],"individuals,":[70],"amounting":[71],"300":[75],"hours":[76],"of":[77],"data.":[79],"We":[80],"built":[81],"state-of-the-art":[83],"model":[85],"new":[88],"conducted":[91],"preliminary":[92],"studies.":[93],"Our":[94],"results":[95],"show":[96],"that":[97,99,114],"models":[98],"achieve":[100],"good":[101],"performance":[102],"small":[104],"vocabulary":[105],"tasks":[106],"may":[107],"perform":[108],"very":[109],"poor":[110],"indicating":[113],"indeed":[118],"challenging":[120],"task,":[121],"main":[124],"challenge":[125],"comes":[126],"unconstrained":[129],"vocabulary.":[130],"baseline":[134],"code":[135],"can":[136],"be":[137],"downloaded":[138],"free":[140],"http://cncvs.cslt.org.":[142]},"counts_by_year":[{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":9},{"year":2023,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
