{"id":"https://openalex.org/W3009620048","doi":"https://doi.org/10.1109/wacv45572.2020.9093314","title":"Watch to Listen Clearly: Visual Speech Enhancement Driven Multi-modality Speech Recognition","display_name":"Watch to Listen Clearly: Visual Speech Enhancement Driven Multi-modality Speech Recognition","publication_year":2020,"publication_date":"2020-03-01","ids":{"openalex":"https://openalex.org/W3009620048","doi":"https://doi.org/10.1109/wacv45572.2020.9093314","mag":"3009620048"},"language":"en","primary_location":{"id":"doi:10.1109/wacv45572.2020.9093314","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv45572.2020.9093314","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 IEEE Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065635207","display_name":"Bo Xu","orcid":"https://orcid.org/0000-0001-6379-7617"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bo Xu","raw_affiliation_strings":["Xpeng motors"],"affiliations":[{"raw_affiliation_string":"Xpeng motors","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004380209","display_name":"Jacob Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacob Wang","raw_affiliation_strings":["Xpeng motors"],"affiliations":[{"raw_affiliation_string":"Xpeng motors","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054796879","display_name":"Cheng Lu","orcid":"https://orcid.org/0000-0002-1477-1020"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng Lu","raw_affiliation_strings":["Xpeng motors"],"affiliations":[{"raw_affiliation_string":"Xpeng motors","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038064028","display_name":"Yandong Guo","orcid":"https://orcid.org/0000-0002-4594-8415"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yandong Guo","raw_affiliation_strings":["Xpeng motors"],"affiliations":[{"raw_affiliation_string":"Xpeng motors","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5065635207"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.0668,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.75540857,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1626","last_page":"1635"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.7799334526062012},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7776573896408081},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7676520943641663},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.6473590135574341},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3795149326324463},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.22305232286453247}],"concepts":[{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.7799334526062012},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7776573896408081},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7676520943641663},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.6473590135574341},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3795149326324463},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.22305232286453247}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/wacv45572.2020.9093314","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv45572.2020.9093314","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 IEEE Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4000000059604645,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":67,"referenced_works":["https://openalex.org/W22517275","https://openalex.org/W1498436455","https://openalex.org/W1686810756","https://openalex.org/W2000399372","https://openalex.org/W2014621385","https://openalex.org/W2015143272","https://openalex.org/W2015394094","https://openalex.org/W2026028405","https://openalex.org/W2060510034","https://openalex.org/W2064675550","https://openalex.org/W2071096833","https://openalex.org/W2097117768","https://openalex.org/W2102113734","https://openalex.org/W2115252128","https://openalex.org/W2117539524","https://openalex.org/W2127141656","https://openalex.org/W2130942839","https://openalex.org/W2131774270","https://openalex.org/W2133564696","https://openalex.org/W2157331557","https://openalex.org/W2163605009","https://openalex.org/W2168744723","https://openalex.org/W2194775991","https://openalex.org/W2271840356","https://openalex.org/W2327501763","https://openalex.org/W2531409750","https://openalex.org/W2551572271","https://openalex.org/W2570575067","https://openalex.org/W2578229578","https://openalex.org/W2594690981","https://openalex.org/W2604379605","https://openalex.org/W2788241093","https://openalex.org/W2890952074","https://openalex.org/W2891205112","https://openalex.org/W2897492880","https://openalex.org/W2952746495","https://openalex.org/W2962824709","https://openalex.org/W2963019222","https://openalex.org/W2963082324","https://openalex.org/W2963403868","https://openalex.org/W2963528589","https://openalex.org/W2963654155","https://openalex.org/W2963820951","https://openalex.org/W2964171275","https://openalex.org/W2964207404","https://openalex.org/W2964283370","https://openalex.org/W2964308564","https://openalex.org/W2972513594","https://openalex.org/W3105000568","https://openalex.org/W4298112588","https://openalex.org/W4385245566","https://openalex.org/W6629815555","https://openalex.org/W6637373629","https://openalex.org/W6675365184","https://openalex.org/W6677618333","https://openalex.org/W6679434410","https://openalex.org/W6679436768","https://openalex.org/W6684191040","https://openalex.org/W6732872814","https://openalex.org/W6734491695","https://openalex.org/W6735927292","https://openalex.org/W6739901393","https://openalex.org/W6749863746","https://openalex.org/W6754392867","https://openalex.org/W6754420807","https://openalex.org/W6780226713","https://openalex.org/W6785715069"],"related_works":["https://openalex.org/W2798138305","https://openalex.org/W3107474891","https://openalex.org/W2653598178","https://openalex.org/W2893763841","https://openalex.org/W2368779261","https://openalex.org/W2794438528","https://openalex.org/W2293562818","https://openalex.org/W2778699561","https://openalex.org/W2995996972","https://openalex.org/W3009620048"],"abstract_inverted_index":{"Multi-modality":[0],"(talking":[1],"face":[2,119],"video":[3,130],"and":[4,91,204],"audio)":[5],"information":[6],"helps":[7],"improve":[8],"speech":[9,34,71,87,97],"recognition":[10,35,72,98],"performance":[11,31],"compared":[12],"to":[13,106,120,138,150],"the":[14,20,30,39,49,56,60,76,85,92,95,116,125,135,164,182,196,201,205],"single":[15],"modality.":[16,123],"In":[17,63,74],"noisy":[18,43,57],"environments,":[19],"effect":[21],"of":[22,32,38,48,80,163,184],"audio":[23,44,50,61,122,126,185],"modality":[24,51,127,131],"is":[25,84,94,104,159,172],"weakened,":[26],"which":[27,171],"further":[28],"affects":[29],"multi-modality":[33,70,96],"(MSR).":[36],"Most":[37],"MSR":[40,136,157,188,198],"methods":[41],"use":[42],"signal":[45],"as":[46],"input":[47],"without":[52],"any":[53],"enhancement":[54,88,186],"(filtering":[55],"components":[58],"in":[59,177],"signal).":[62],"this":[64],"paper,":[65],"we":[66],"propose":[67],"an":[68],"audio-enhanced":[69],"model.":[73],"particular,":[75],"proposed":[77,193],"model":[78],"consists":[79],"two":[81],"sub-networks,":[82],"one":[83],"visual":[86,148,154],"(VE)":[89],"sub-network":[90,103,137,158],"other":[93],"(MSR)":[99],"sub-network.":[100],"The":[101,156,192],"VE":[102],"able":[105],"separate":[107],"a":[108,143],"speaker\u2019s":[109],"voice":[110],"from":[111],"background":[112],"noises":[113],"when":[114],"given":[115],"corresponding":[117],"talking":[118],"enhance":[121],"Then":[124],"together":[128],"with":[129],"are":[132],"fed":[133],"into":[134],"produce":[139],"characters.":[140],"We":[141,180],"introduce":[142],"pseudo-3D":[144],"residual":[145],"network":[146],"(P3D)based":[147],"front-end":[149],"extract":[151],"more":[152,173],"advantageous":[153],"features.":[155],"built":[160],"on":[161,200],"top":[162],"Element-wise-Attention":[165],"Gated":[166],"Recurrent":[167],"Unit":[168],"(EleAttGRU)":[169],"architecture":[170],"effective":[174],"than":[175],"Transformer":[176],"long":[178],"sequences.":[179],"demonstrate":[181],"effectiveness":[183],"for":[187],"by":[189],"extensive":[190],"experiments.":[191],"method":[194],"surpasses":[195],"state-of-the-art":[197],"models":[199],"LRS3-TED":[202],"dataset":[203],"LRW":[206],"dataset.":[207]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
