{"id":"https://openalex.org/W3103085143","doi":"https://doi.org/10.1109/icassp39728.2021.9414040","title":"Speech Prediction in Silent Videos Using Variational Autoencoders","display_name":"Speech Prediction in Silent Videos Using Variational Autoencoders","publication_year":2021,"publication_date":"2021-05-13","ids":{"openalex":"https://openalex.org/W3103085143","doi":"https://doi.org/10.1109/icassp39728.2021.9414040","mag":"3103085143"},"language":"en","primary_location":{"id":"doi:10.1109/icassp39728.2021.9414040","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414040","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2011.07340","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010648323","display_name":"Ravindra Yadav","orcid":"https://orcid.org/0000-0003-4628-0688"},"institutions":[{"id":"https://openalex.org/I94234084","display_name":"Indian Institute of Technology Kanpur","ror":"https://ror.org/05pjsgx75","country_code":"IN","type":"education","lineage":["https://openalex.org/I94234084"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Ravindra Yadav","raw_affiliation_strings":["Indian Institute of Technology Kanpur, India","Indian Institute of Technology - Kanpur, India#TAB#"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Technology Kanpur, India","institution_ids":["https://openalex.org/I94234084"]},{"raw_affiliation_string":"Indian Institute of Technology - Kanpur, India#TAB#","institution_ids":["https://openalex.org/I94234084"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083439972","display_name":"Ashish Sardana","orcid":null},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Ashish Sardana","raw_affiliation_strings":["NVIDIA","nVidia"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]},{"raw_affiliation_string":"nVidia","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007109424","display_name":"Vinay P. Namboodiri","orcid":"https://orcid.org/0000-0001-5262-9722"},"institutions":[{"id":"https://openalex.org/I51601045","display_name":"University of Bath","ror":"https://ror.org/002h8g185","country_code":"GB","type":"education","lineage":["https://openalex.org/I51601045"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Vinay P Namboodiri","raw_affiliation_strings":["University of Bath, UK","#N#                  University of Bath, UK#N#"],"affiliations":[{"raw_affiliation_string":"University of Bath, UK","institution_ids":["https://openalex.org/I51601045"]},{"raw_affiliation_string":"#N#                  University of Bath, UK#N#","institution_ids":["https://openalex.org/I51601045"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085503354","display_name":"Rajesh M. Hegde","orcid":"https://orcid.org/0000-0002-6142-7724"},"institutions":[{"id":"https://openalex.org/I94234084","display_name":"Indian Institute of Technology Kanpur","ror":"https://ror.org/05pjsgx75","country_code":"IN","type":"education","lineage":["https://openalex.org/I94234084"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Rajesh M Hegde","raw_affiliation_strings":["Indian Institute of Technology Kanpur, India","Indian Institute of Technology - Kanpur, India#TAB#"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Technology Kanpur, India","institution_ids":["https://openalex.org/I94234084"]},{"raw_affiliation_string":"Indian Institute of Technology - Kanpur, India#TAB#","institution_ids":["https://openalex.org/I94234084"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5010648323"],"corresponding_institution_ids":["https://openalex.org/I94234084"],"apc_list":null,"apc_paid":null,"fwci":0.1539,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.39222662,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"7048","last_page":"7052"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7863545417785645},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.55987948179245},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5275102257728577},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5226906538009644},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5217869281768799},{"id":"https://openalex.org/keywords/ranging","display_name":"Ranging","score":0.5129973292350769},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4808812439441681},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4383709728717804},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.42874160408973694},{"id":"https://openalex.org/keywords/grid","display_name":"Grid","score":0.42590227723121643},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4198591113090515},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4080892503261566},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3395358622074127}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7863545417785645},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.55987948179245},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5275102257728577},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5226906538009644},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5217869281768799},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.5129973292350769},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4808812439441681},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4383709728717804},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.42874160408973694},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.42590227723121643},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4198591113090515},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4080892503261566},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3395358622074127},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1109/icassp39728.2021.9414040","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414040","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2011.07340","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2011.07340","pdf_url":"https://arxiv.org/pdf/2011.07340","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:3103085143","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2011.07340.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2011.07340","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2011.07340","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"},{"id":"doi:10.17023/x8zh-0b82","is_oa":true,"landing_page_url":"https://doi.org/10.17023/x8zh-0b82","pdf_url":null,"source":{"id":"https://openalex.org/S7407051697","display_name":"IEEE RESOURCE CENTERS","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2011.07340","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2011.07340","pdf_url":"https://arxiv.org/pdf/2011.07340","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.6200000047683716}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3103085143.pdf"},"referenced_works_count":23,"referenced_works":["https://openalex.org/W1552314771","https://openalex.org/W1959608418","https://openalex.org/W2015143272","https://openalex.org/W2064675550","https://openalex.org/W2067295501","https://openalex.org/W2293856338","https://openalex.org/W2516001803","https://openalex.org/W2585824449","https://openalex.org/W2625027024","https://openalex.org/W2962835968","https://openalex.org/W2962897886","https://openalex.org/W2963019222","https://openalex.org/W2963609956","https://openalex.org/W2964095416","https://openalex.org/W2964243274","https://openalex.org/W2964352155","https://openalex.org/W2972563022","https://openalex.org/W3035626590","https://openalex.org/W6637373629","https://openalex.org/W6639732818","https://openalex.org/W6640963894","https://openalex.org/W6751750676","https://openalex.org/W6756197946"],"related_works":["https://openalex.org/W3045032902","https://openalex.org/W3187009280","https://openalex.org/W3098252333","https://openalex.org/W2611160234","https://openalex.org/W1480583224","https://openalex.org/W3209013111","https://openalex.org/W2990467045","https://openalex.org/W2056961398","https://openalex.org/W3141688548","https://openalex.org/W3015925607","https://openalex.org/W2995255435","https://openalex.org/W2613448434","https://openalex.org/W3036496243","https://openalex.org/W2461011248","https://openalex.org/W2982076115","https://openalex.org/W2186282052","https://openalex.org/W2963917086","https://openalex.org/W3093287838","https://openalex.org/W2946520073","https://openalex.org/W2594156432"],"abstract_inverted_index":{"Understanding":[0],"the":[1,4,37,51,55,69,79,84,90,123,129,134,140],"relationship":[2],"between":[3,68],"auditory":[5,124],"and":[6,20,42,58,116],"visual":[7,30,43,130],"signals":[8],"is":[9,34,45],"crucial":[10],"for":[11,102],"many":[12],"different":[13],"applications":[14],"ranging":[15],"from":[16],"computer-generated":[17],"imagery":[18],"(CGI)":[19],"video":[21],"editing":[22],"automation":[23],"to":[24,75,82,121],"assisting":[25],"people":[26],"with":[27],"hearing":[28],"or":[29],"impairments.":[31],"However,":[32],"this":[33,95],"challenging":[35],"since":[36],"distribution":[38,127],"of":[39,50,136],"both":[40],"audio":[41],"modality":[44],"inherently":[46],"multi-modal.":[47],"Therefore,":[48],"most":[49],"existing":[52],"methods":[53],"ignore":[54],"multimodal":[56],"aspect":[57],"assume":[59],"that":[60],"there":[61],"only":[62],"exists":[63],"a":[64,99,106],"deterministic":[65],"one-to-one":[66],"mapping":[67],"two":[70],"modalities.":[71],"It":[72],"can":[73],"lead":[74],"low-quality":[76],"predictions":[77],"as":[78],"model":[80,101,111,138],"collapses":[81],"optimizing":[83],"average":[85],"behavior":[86],"rather":[87],"than":[88],"learning":[89],"full":[91],"data":[92],"distributions.":[93],"In":[94],"paper,":[96],"we":[97],"present":[98],"stochastic":[100],"generating":[103],"speech":[104],"in":[105],"silent":[107],"video.":[108],"The":[109],"proposed":[110],"combines":[112],"recurrent":[113],"neural":[114],"networks":[115],"variational":[117],"deep":[118],"generative":[119],"models":[120],"learn":[122],"signal\u2019s":[125],"conditional":[126],"given":[128],"signal.":[131],"We":[132],"demonstrate":[133],"performance":[135],"our":[137],"on":[139,144],"GRID":[141],"dataset":[142],"based":[143],"standard":[145],"benchmarks.":[146]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
