{"id":"https://openalex.org/W3162596541","doi":"https://doi.org/10.1109/icpr48806.2021.9412817","title":"Robust Audio-Visual Speech Recognition Based on Hybrid Fusion","display_name":"Robust Audio-Visual Speech Recognition Based on Hybrid Fusion","publication_year":2021,"publication_date":"2021-01-10","ids":{"openalex":"https://openalex.org/W3162596541","doi":"https://doi.org/10.1109/icpr48806.2021.9412817","mag":"3162596541"},"language":"en","primary_location":{"id":"doi:10.1109/icpr48806.2021.9412817","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpr48806.2021.9412817","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 25th International Conference on Pattern Recognition (ICPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100410326","display_name":"Hong Liu","orcid":"https://orcid.org/0000-0002-7498-6541"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hong Liu","raw_affiliation_strings":["Key Laboratory of Machine Perception Shenzhen Graduate School, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Machine Perception Shenzhen Graduate School, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100362614","display_name":"Wenhao Li","orcid":"https://orcid.org/0000-0001-8048-2668"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenhao Li","raw_affiliation_strings":["Key Laboratory of Machine Perception Shenzhen Graduate School, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Machine Perception Shenzhen Graduate School, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086513946","display_name":"Bing Yang","orcid":"https://orcid.org/0000-0002-8978-2322"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bing Yang","raw_affiliation_strings":["Key Laboratory of Machine Perception Shenzhen Graduate School, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Machine Perception Shenzhen Graduate School, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100410326"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":1.0665,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.76308316,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"7580","last_page":"7586"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8098869323730469},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7346649169921875},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6876860857009888},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6382718086242676},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6167502999305725},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4608248472213745},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.44616007804870605},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4121784567832947}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8098869323730469},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7346649169921875},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6876860857009888},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6382718086242676},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6167502999305725},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4608248472213745},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.44616007804870605},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4121784567832947},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icpr48806.2021.9412817","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpr48806.2021.9412817","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2020 25th International Conference on Pattern Recognition (ICPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.6100000143051147,"display_name":"Reduced inequalities"}],"awards":[{"id":"https://openalex.org/G401644923","display_name":null,"funder_award_id":"61673030,U1613209","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1530111162","https://openalex.org/W1974387177","https://openalex.org/W2015143272","https://openalex.org/W2035777533","https://openalex.org/W2055098062","https://openalex.org/W2060510034","https://openalex.org/W2096391593","https://openalex.org/W2164337182","https://openalex.org/W2164450870","https://openalex.org/W2285716245","https://openalex.org/W2474638510","https://openalex.org/W2551572271","https://openalex.org/W2578229578","https://openalex.org/W2594690981","https://openalex.org/W2619383789","https://openalex.org/W2884454321","https://openalex.org/W2889624961","https://openalex.org/W2889839708","https://openalex.org/W2890952074","https://openalex.org/W2898926657","https://openalex.org/W2901907199","https://openalex.org/W2952746495","https://openalex.org/W2963295405","https://openalex.org/W2963528589","https://openalex.org/W2963654155","https://openalex.org/W2963785710","https://openalex.org/W2970270554","https://openalex.org/W2996970093","https://openalex.org/W3102469722","https://openalex.org/W3127334095","https://openalex.org/W4244018879","https://openalex.org/W4298112588","https://openalex.org/W6684209796"],"related_works":["https://openalex.org/W2965546495","https://openalex.org/W4389116644","https://openalex.org/W2153315159","https://openalex.org/W3103844505","https://openalex.org/W259157601","https://openalex.org/W4205463238","https://openalex.org/W2761785940","https://openalex.org/W2110523656","https://openalex.org/W1482209366","https://openalex.org/W2521627374"],"abstract_inverted_index":{"The":[0],"fusion":[1,22,27,40,64,81,146],"of":[2,11,44,49,58,61,70,162,195],"audio":[3,117],"and":[4,66,88,103,118,132],"visual":[5,119],"modalities":[6,36],"is":[7,17,95,113,148],"an":[8,46],"important":[9],"stage":[10],"audio-visual":[12,111,163],"speech":[13],"recognition":[14],"(AVSR),":[15],"which":[16,94,171],"generally":[18],"approached":[19],"through":[20],"feature":[21,131],"or":[23],"decision":[24,39,145],"fusion.":[25],"Feature":[26],"can":[28],"exploit":[29],"the":[30,42,59,62,68,134,159,177,186,193,196],"covariations":[31],"between":[32,137],"features":[33,120],"from":[34],"different":[35,141],"effectively,":[37],"whereas":[38],"shows":[41,172],"robustness":[43,194],"capturing":[45],"optimal":[47],"combination":[48],"multimodality.":[50],"In":[51],"this":[52],"work,":[53],"to":[54,97,115,126,150,156],"take":[55],"full":[56],"advantage":[57],"complementarity":[60],"two":[63],"strategies":[65],"address":[67],"challenge":[69],"inherent":[71],"ambiguity":[72],"in":[73,100,154,175],"noisy":[74,104,201],"environments,":[75],"we":[76,166],"propose":[77],"a":[78,107,122,144,168],"novel":[79],"hybrid":[80],"based":[82],"AVSR":[83],"method":[84,198],"with":[85],"residual":[86],"networks":[87],"Bidirectional":[89],"Gated":[90],"Recurrent":[91],"Unit":[92],"(BGRU),":[93],"able":[96],"distinguish":[98],"homophones":[99],"both":[101],"clean":[102],"conditions.":[105,202],"Specifically,":[106],"simple":[108],"yet":[109],"effective":[110],"encoder":[112],"used":[114],"map":[116],"into":[121],"shared":[123],"latent":[124],"space":[125],"capture":[127],"more":[128],"discriminative":[129],"multi-modal":[130],"find":[133],"internal":[135],"correlation":[136],"spatial-temporal":[138],"information":[139],"for":[140],"modalities.":[142,182],"Furthermore,":[143],"module":[147],"designed":[149],"get":[151],"final":[152],"predictions":[153],"order":[155],"robustly":[157],"utilize":[158],"reliability":[160],"measures":[161],"information.":[164],"Finally,":[165],"introduce":[167],"combined":[169],"loss,":[170],"its":[173],"noise-robustness":[174],"learning":[176],"joint":[178],"representation":[179],"across":[180],"various":[181,200],"Experimental":[183],"results":[184],"on":[185],"largest":[187],"publicly":[188],"available":[189],"dataset":[190],"(LRW)":[191],"demonstrate":[192],"proposed":[197],"under":[199]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
