{"id":"https://openalex.org/W4319442337","doi":"https://doi.org/10.3390/s23041834","title":"Multimodal Sensor-Input Architecture with Deep Learning for Audio-Visual Speech Recognition in Wild","display_name":"Multimodal Sensor-Input Architecture with Deep Learning for Audio-Visual Speech Recognition in Wild","publication_year":2023,"publication_date":"2023-02-07","ids":{"openalex":"https://openalex.org/W4319442337","doi":"https://doi.org/10.3390/s23041834","pmid":"https://pubmed.ncbi.nlm.nih.gov/36850432"},"language":"en","primary_location":{"id":"doi:10.3390/s23041834","is_oa":true,"landing_page_url":"https://doi.org/10.3390/s23041834","pdf_url":"https://www.mdpi.com/1424-8220/23/4/1834/pdf?version=1675753549","source":{"id":"https://openalex.org/S101949793","display_name":"Sensors","issn_l":"1424-8220","issn":["1424-8220"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Sensors","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj","pubmed"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/1424-8220/23/4/1834/pdf?version=1675753549","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101865953","display_name":"Yibo He","orcid":"https://orcid.org/0000-0002-6306-0647"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yibo He","raw_affiliation_strings":["School of AI and Advanced Computing, Xian Jiaotong Liverpool University, Suzhou 215123, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of AI and Advanced Computing, Xian Jiaotong Liverpool University, Suzhou 215123, China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111525101","display_name":"Kah Phooi Seng","orcid":null},"institutions":[{"id":"https://openalex.org/I160993911","display_name":"Queensland University of Technology","ror":"https://ror.org/03pnv4752","country_code":"AU","type":"education","lineage":["https://openalex.org/I160993911"]},{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["AU","CN"],"is_corresponding":true,"raw_author_name":"Kah Phooi Seng","raw_affiliation_strings":["School of AI and Advanced Computing, Xian Jiaotong Liverpool University, Suzhou 215123, China","School of Computer Science, Queensland University of Technology, Brisbane, QLD 4000, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of AI and Advanced Computing, Xian Jiaotong Liverpool University, Suzhou 215123, China","institution_ids":["https://openalex.org/I69356397"]},{"raw_affiliation_string":"School of Computer Science, Queensland University of Technology, Brisbane, QLD 4000, Australia","institution_ids":["https://openalex.org/I160993911"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5073635669","display_name":"Li-Minn Ang","orcid":"https://orcid.org/0000-0002-2402-7529"},"institutions":[{"id":"https://openalex.org/I174025329","display_name":"University of the Sunshine Coast","ror":"https://ror.org/016gb9e15","country_code":"AU","type":"education","lineage":["https://openalex.org/I174025329"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Li Minn Ang","raw_affiliation_strings":["School of Science, Technology and Engineering, University of Sunshine Coast, Sippy Downs, QLD 4502, Australia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Science, Technology and Engineering, University of Sunshine Coast, Sippy Downs, QLD 4502, Australia","institution_ids":["https://openalex.org/I174025329"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5111525101"],"corresponding_institution_ids":["https://openalex.org/I160993911","https://openalex.org/I69356397"],"apc_list":{"value":2400,"currency":"CHF","value_usd":2598},"apc_paid":{"value":2400,"currency":"CHF","value_usd":2598},"fwci":3.3699,"has_fulltext":true,"cited_by_count":18,"citation_normalized_percentile":{"value":0.93245755,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"23","issue":"4","first_page":"1834","last_page":"1834"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.987500011920929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.770784854888916},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.734637975692749},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.47795653343200684},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.47306740283966064},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4706265926361084},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.44501838088035583},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.324873685836792}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.770784854888916},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.734637975692749},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47795653343200684},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.47306740283966064},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4706265926361084},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.44501838088035583},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.324873685836792},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D000077321","descriptor_name":"Deep Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000077321","descriptor_name":"Deep Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D000077321","descriptor_name":"Deep Learning","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013067","descriptor_name":"Speech Perception","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D013067","descriptor_name":"Speech Perception","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D013067","descriptor_name":"Speech Perception","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":5,"locations":[{"id":"doi:10.3390/s23041834","is_oa":true,"landing_page_url":"https://doi.org/10.3390/s23041834","pdf_url":"https://www.mdpi.com/1424-8220/23/4/1834/pdf?version=1675753549","source":{"id":"https://openalex.org/S101949793","display_name":"Sensors","issn_l":"1424-8220","issn":["1424-8220"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Sensors","raw_type":"journal-article"},{"id":"pmid:36850432","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/36850432","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Sensors (Basel, Switzerland)","raw_type":null},{"id":"pmh:oai:pubmedcentral.nih.gov:9959127","is_oa":true,"landing_page_url":"https://www.ncbi.nlm.nih.gov/pmc/articles/9959127","pdf_url":"https://pmc.ncbi.nlm.nih.gov/articles/PMC9959127/pdf/sensors-23-01834.pdf","source":{"id":"https://openalex.org/S2764455111","display_name":"PubMed Central","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Sensors (Basel)","raw_type":"Text"},{"id":"pmh:oai:doaj.org/article:c61a2a5a15f24d50be1b6f58a7201463","is_oa":true,"landing_page_url":"https://doaj.org/article/c61a2a5a15f24d50be1b6f58a7201463","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Sensors, Vol 23, Iss 4, p 1834 (2023)","raw_type":"article"},{"id":"pmh:oai:mdpi.com:/1424-8220/23/4/1834/","is_oa":true,"landing_page_url":"https://dx.doi.org/10.3390/s23041834","pdf_url":null,"source":{"id":"https://openalex.org/S4306400947","display_name":"MDPI (MDPI AG)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210097602","host_organization_name":"Multidisciplinary Digital Publishing Institute (Switzerland)","host_organization_lineage":["https://openalex.org/I4210097602"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Sensors","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.3390/s23041834","is_oa":true,"landing_page_url":"https://doi.org/10.3390/s23041834","pdf_url":"https://www.mdpi.com/1424-8220/23/4/1834/pdf?version=1675753549","source":{"id":"https://openalex.org/S101949793","display_name":"Sensors","issn_l":"1424-8220","issn":["1424-8220"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Sensors","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6200000047683716,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4319442337.pdf"},"referenced_works_count":25,"referenced_works":["https://openalex.org/W1974783905","https://openalex.org/W2103967578","https://openalex.org/W2121486117","https://openalex.org/W2144003766","https://openalex.org/W2161078696","https://openalex.org/W2551572271","https://openalex.org/W2606977969","https://openalex.org/W2611905122","https://openalex.org/W2749066063","https://openalex.org/W2782689936","https://openalex.org/W2884454321","https://openalex.org/W2890952074","https://openalex.org/W2895006884","https://openalex.org/W2912581782","https://openalex.org/W2963499843","https://openalex.org/W2964109005","https://openalex.org/W2980393359","https://openalex.org/W3015383493","https://openalex.org/W3044285380","https://openalex.org/W3081492798","https://openalex.org/W3101631197","https://openalex.org/W3160554450","https://openalex.org/W3186343939","https://openalex.org/W6675333858","https://openalex.org/W6683878071"],"related_works":["https://openalex.org/W2167155152","https://openalex.org/W2136763963","https://openalex.org/W2404514746","https://openalex.org/W2109705048","https://openalex.org/W2940588515","https://openalex.org/W1652783584","https://openalex.org/W1909151225","https://openalex.org/W2160030256","https://openalex.org/W1521297879","https://openalex.org/W4253235840"],"abstract_inverted_index":{"This":[0],"paper":[1],"investigates":[2],"multimodal":[3],"sensor":[4],"architectures":[5,194,200],"with":[6,164],"deep":[7],"learning":[8],"for":[9,27,78,90,178],"audio-visual":[10,196],"speech":[11,36,191,197],"recognition,":[12],"focusing":[13],"on":[14],"in-the-wild":[15,63],"scenarios.":[16],"The":[17,150],"term":[18],"\"in":[19],"the":[20,92,96,123,139,147,157,165,169,172,176,187,207],"wild\"":[21],"is":[22,39,71],"used":[23,126],"to":[24,85],"describe":[25],"AVSR":[26,79,159,173,208],"unconstrained":[28],"natural-language":[29],"audio":[30,47],"streams":[31],"and":[32,53,108,136,142,152,161,195,201],"video-stream":[33],"modalities.":[34],"Audio-visual":[35],"recognition":[37,192,198],"(AVSR)":[38,199],"a":[40,50,101,203],"speech-recognition":[41],"task":[42],"that":[43,156,210],"leverages":[44],"both":[45],"an":[46,54],"input":[48,57],"of":[49,58,103,171,189,206],"human":[51],"voice":[52],"aligned":[55],"visual":[56],"lip":[59],"motions.":[60],"However,":[61],"since":[62],"scenarios":[64],"can":[65],"include":[66],"more":[67,87],"noise,":[68],"AVSR's":[69],"performance":[70,170],"affected.":[72],"Here,":[73],"we":[74,99,125,185],"propose":[75],"new":[76],"improvements":[77],"models":[80,209],"by":[81],"incorporating":[82],"data-augmentation":[83,97],"techniques":[84],"generate":[86],"data":[88,128],"samples":[89],"building":[91],"classification":[93],"models.":[94],"For":[95],"techniques,":[98],"utilized":[100],"combination":[102],"conventional":[104],"approaches":[105],"(e.g.,":[106],"flips":[107],"rotations),":[109],"as":[110,112,116],"well":[111],"newer":[113],"approaches,":[114,124],"such":[115],"generative":[117],"adversarial":[118],"networks":[119],"(GANs).":[120],"To":[121],"validate":[122],"augmented":[127],"from":[129],"well-known":[130],"datasets":[131],"(LRS2-Lip":[132],"Reading":[133],"Sentences":[134],"2":[135],"LRS3)":[137],"in":[138,175,182],"training":[140],"process":[141],"testing":[143],"was":[144],"performed":[145],"using":[146],"original":[148],"data.":[149],"study":[151],"experimental":[153],"results":[154],"indicated":[155],"proposed":[158],"model":[160],"framework,":[162],"combined":[163],"augmentation":[166],"approach,":[167],"enhanced":[168],"framework":[174],"wild":[177],"noisy":[179],"datasets.":[180],"Furthermore,":[181],"this":[183],"study,":[184],"discuss":[186],"domains":[188],"automatic":[190],"(ASR)":[193],"give":[202],"concise":[204],"summary":[205],"have":[211],"been":[212],"proposed.":[213]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":2}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
