{"id":"https://openalex.org/W3094706461","doi":"https://doi.org/10.21437/interspeech.2020-2935","title":"Visual Speech In Real Noisy Environments (VISION): A Novel Benchmark Dataset and Deep Learning-Based Baseline System","display_name":"Visual Speech In Real Noisy Environments (VISION): A Novel Benchmark Dataset and Deep Learning-Based Baseline System","publication_year":2020,"publication_date":"2020-10-25","ids":{"openalex":"https://openalex.org/W3094706461","doi":"https://doi.org/10.21437/interspeech.2020-2935","mag":"3094706461"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2020-2935","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-2935","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068981769","display_name":"Mandar Gogate","orcid":"https://orcid.org/0000-0003-1712-9014"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mandar Gogate","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058869522","display_name":"Kia Dashtipour","orcid":"https://orcid.org/0000-0002-9651-6487"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kia Dashtipour","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5062211930","display_name":"Amir Hussain","orcid":"https://orcid.org/0000-0002-8080-082X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amir Hussain","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5068981769"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.6545,"has_fulltext":false,"cited_by_count":27,"citation_normalized_percentile":{"value":0.93931658,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4521","last_page":"4525"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.8898801803588867},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8273354172706604},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.738631010055542},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6424846053123474},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6187729835510254},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4346987009048462},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3336490988731384},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.08130595088005066},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.07606926560401917},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.06899788975715637}],"concepts":[{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.8898801803588867},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8273354172706604},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.738631010055542},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6424846053123474},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6187729835510254},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4346987009048462},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3336490988731384},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.08130595088005066},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.07606926560401917},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.06899788975715637},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2020-2935","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-2935","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},{"id":"pmh:oai:napier-surface.worktribe.com:2867010","is_oa":false,"landing_page_url":"http://researchrepository.napier.ac.uk/Output/2867010","pdf_url":null,"source":{"id":"https://openalex.org/S4306400544","display_name":"Research Output (Edinburgh Napier University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I251738","host_organization_name":"Edinburgh Napier University","host_organization_lineage":["https://openalex.org/I251738"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference Proceeding"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4769349813","display_name":null,"funder_award_id":"EP/M026981/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W142945732","https://openalex.org/W1552314771","https://openalex.org/W2015143272","https://openalex.org/W2039846947","https://openalex.org/W2115252128","https://openalex.org/W2289394825","https://openalex.org/W2788241093","https://openalex.org/W2802541021","https://openalex.org/W2886232760","https://openalex.org/W2978232138"],"related_works":["https://openalex.org/W2378211422","https://openalex.org/W2383111961","https://openalex.org/W2365952365","https://openalex.org/W2352448290","https://openalex.org/W2380820513","https://openalex.org/W4321353415","https://openalex.org/W2913146933","https://openalex.org/W2745001401","https://openalex.org/W2372385138","https://openalex.org/W4296359239"],"abstract_inverted_index":{"In":[0,7,129],"this":[1],"paper,":[2],"we":[3,131],"present":[4,132],"VIsual":[5],"Speech":[6],"real":[8,28,63],"nOisy":[9],"eNvironments":[10],"(VISION),":[11],"a":[12,39,66,80,110,133],"first":[13],"of":[14,41,69,74,83,92,103,120,158],"its":[15],"kind":[16],"audio-visual":[17],"(AV)":[18],"corpus":[19,95,112],"comprising":[20],"2500":[21],"utterances":[22],"from":[23],"209":[24],"speakers,":[25,70],"recorded":[26,61],"in":[27,48,62,79,100,125],"noisy":[29,127],"environments":[30,64],"including":[31],"social":[32],"gatherings,":[33],"streets,":[34],"cafeterias":[35],"and":[36,86,107],"restaurants.":[37],"While":[38],"number":[40],"speech":[42,59,122,145,165],"enhancement":[43,123,166],"frameworks":[44],"have":[45],"been":[46],"proposed":[47],"the":[49,101,159],"literature":[50],"that":[51,113],"exploit":[52],"AV":[53,75,94,104,121],"cues,":[54],"there":[55],"are":[56],"no":[57],"visual":[58,85],"corpora":[60],"with":[65,150],"sufficient":[67],"variety":[68],"to":[71,97,108,163],"enable":[72],"evaluation":[73,119],"frameworks'":[76],"generalisation":[77],"capability":[78],"wide":[81],"range":[82],"background":[84],"acoustic":[87],"noises.":[88],"The":[89],"main":[90],"purpose":[91],"our":[93],"is":[96],"foster":[98],"research":[99],"area":[102],"signal":[105],"processing":[106],"provide":[109],"benchmark":[111],"can":[114],"be":[115],"used":[116],"for":[117,144],"reliable":[118],"systems":[124],"everyday":[126],"settings.":[128],"addition,":[130],"baseline":[134,160],"deep":[135],"neural":[136],"network":[137],"(DNN)":[138],"based":[139],"spectral":[140],"mask":[141],"estimation":[142],"model":[143],"enhancement.":[146],"Comparative":[147],"simulation":[148],"results":[149],"subjective":[151],"listening":[152],"tests":[153],"demonstrate":[154],"significant":[155],"performance":[156],"improvement":[157],"DNN":[161],"compared":[162],"state-of-the-art":[164],"approaches.":[167]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":16}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
