{"id":"https://openalex.org/W3005559510","doi":"https://doi.org/10.1109/icassp40776.2020.9053397","title":"Looking Enhances Listening: Recovering Missing Speech Using Images","display_name":"Looking Enhances Listening: Recovering Missing Speech Using Images","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W3005559510","doi":"https://doi.org/10.1109/icassp40776.2020.9053397","mag":"3005559510"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9053397","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053397","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2002.05639","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082028315","display_name":"Tejas Srinivasan","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Tejas Srinivasan","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, U.S.A","Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, U.S.A","institution_ids":["https://openalex.org/I74973139"]},{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053704856","display_name":"Ramon Sanabria","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]},{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Ramon Sanabria","raw_affiliation_strings":["CSTR and ILCC, University of Edinburgh, UK","Carnegie Mellon University"],"affiliations":[{"raw_affiliation_string":"CSTR and ILCC, University of Edinburgh, UK","institution_ids":["https://openalex.org/I98677209"]},{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085262529","display_name":"Florian Metze","orcid":"https://orcid.org/0000-0002-6663-8600"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]},{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Florian Metze","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, U.S.A","University of Edinburgh,"],"affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, U.S.A","institution_ids":["https://openalex.org/I74973139"]},{"raw_affiliation_string":"University of Edinburgh,","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5082028315"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.01550691,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"6304","last_page":"6308"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7931758761405945},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6655958890914917},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.48689451813697815},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48073410987854004},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.4733322262763977},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.47095948457717896},{"id":"https://openalex.org/keywords/active-listening","display_name":"Active listening","score":0.45575326681137085},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4286853075027466},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4227987825870514},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.2495776116847992},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.08904305100440979},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.062321364879608154}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7931758761405945},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6655958890914917},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.48689451813697815},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48073410987854004},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.4733322262763977},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.47095948457717896},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.45575326681137085},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4286853075027466},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4227987825870514},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2495776116847992},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.08904305100440979},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.062321364879608154},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1109/icassp40776.2020.9053397","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053397","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2002.05639","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2002.05639","pdf_url":"https://arxiv.org/pdf/2002.05639","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:3005559510","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2002.05639.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2002.05639","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2002.05639","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"},{"id":"doi:10.17023/zr83-0f44","is_oa":true,"landing_page_url":"https://doi.org/10.17023/zr83-0f44","pdf_url":null,"source":{"id":"https://openalex.org/S7407051697","display_name":"IEEE RESOURCE CENTERS","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2002.05639","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2002.05639","pdf_url":"https://arxiv.org/pdf/2002.05639","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.47999998927116394}],"awards":[{"id":"https://openalex.org/G4713059963","display_name":null,"funder_award_id":"FA8750","funder_id":"https://openalex.org/F4320332180","funder_display_name":"Defense Advanced Research Projects Agency"},{"id":"https://openalex.org/G740863221","display_name":null,"funder_award_id":"FA8750-18-2-0018","funder_id":"https://openalex.org/F4320332180","funder_display_name":"Defense Advanced Research Projects Agency"}],"funders":[{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3005559510.pdf","grobid_xml":"https://content.openalex.org/works/W3005559510.grobid-xml"},"referenced_works_count":29,"referenced_works":["https://openalex.org/W1524333225","https://openalex.org/W1933349210","https://openalex.org/W2043701535","https://openalex.org/W2194775991","https://openalex.org/W2295158492","https://openalex.org/W2327501763","https://openalex.org/W2509282593","https://openalex.org/W2512544816","https://openalex.org/W2586850765","https://openalex.org/W2623210963","https://openalex.org/W2714726990","https://openalex.org/W2732026016","https://openalex.org/W2768661419","https://openalex.org/W2889903020","https://openalex.org/W2914781455","https://openalex.org/W2953472911","https://openalex.org/W2962826786","https://openalex.org/W2962862718","https://openalex.org/W2962866381","https://openalex.org/W2963360627","https://openalex.org/W2964182350","https://openalex.org/W3042657922","https://openalex.org/W3098507616","https://openalex.org/W4249013746","https://openalex.org/W6631362777","https://openalex.org/W6697284302","https://openalex.org/W6740934225","https://openalex.org/W6755559483","https://openalex.org/W6764984290"],"related_works":["https://openalex.org/W2964182350","https://openalex.org/W2006798524","https://openalex.org/W3163142165","https://openalex.org/W2790326622","https://openalex.org/W1933749547","https://openalex.org/W2935938411","https://openalex.org/W2186573323","https://openalex.org/W2622954728","https://openalex.org/W2105523781","https://openalex.org/W3015785290","https://openalex.org/W3081997829","https://openalex.org/W142758689","https://openalex.org/W2889624961","https://openalex.org/W3205533980","https://openalex.org/W2889596223","https://openalex.org/W3134538211","https://openalex.org/W2010377246","https://openalex.org/W2586665961","https://openalex.org/W3097385349","https://openalex.org/W3107909154"],"abstract_inverted_index":{"Speech":[0],"is":[1],"understood":[2],"better":[3],"by":[4,88,128],"using":[5,92],"visual":[6,65,94,100,131],"context;":[7],"for":[8],"this":[9,50],"reason,":[10],"there":[11],"have":[12],"been":[13],"many":[14],"attempts":[15],"to":[16,19,106,126],"use":[17,37],"images":[18,38],"adapt":[20],"automatic":[21],"speech":[22],"recognition":[23],"(ASR)":[24],"systems.":[25],"Current":[26],"work,":[27],"however,":[28],"has":[29],"shown":[30],"that":[31,73,98,117],"visually":[32],"adapted":[33],"ASR":[34,75,120],"models":[35,76],"only":[36],"as":[39],"a":[40,54],"regularization":[41],"signal,":[42,87],"while":[43],"completely":[44],"ignoring":[45],"their":[46],"semantic":[47],"content.":[48],"In":[49],"paper,":[51],"we":[52,59],"present":[53],"set":[55],"of":[56,63],"experiments":[57],"where":[58],"show":[60,72],"the":[61,64,84,93,130],"utility":[62],"modality":[66],"under":[67],"noisy":[68],"conditions.":[69],"Our":[70],"results":[71,115],"multimodal":[74,119],"can":[77,102,122],"recover":[78],"words":[79],"which":[80],"are":[81],"masked":[82,111],"in":[83,104,110],"input":[85],"acoustic":[86],"grounding":[89],"its":[90],"transcriptions":[91],"representations.":[95],"We":[96],"observe":[97],"integrating":[99],"context":[101],"result":[103],"up":[105],"35%":[107],"relative":[108],"improvement":[109],"word":[112],"recovery.":[113],"These":[114],"demonstrate":[116],"end-to-end":[118],"systems":[121],"become":[123],"more":[124],"robust":[125],"noise":[127],"leveraging":[129],"context.":[132]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
