{"id":"https://openalex.org/W4308236834","doi":"https://doi.org/10.1109/icip46576.2022.9897235","title":"Learning Contextually Fused Audio-Visual Representations For Audio-Visual Speech Recognition","display_name":"Learning Contextually Fused Audio-Visual Representations For Audio-Visual Speech Recognition","publication_year":2022,"publication_date":"2022-10-16","ids":{"openalex":"https://openalex.org/W4308236834","doi":"https://doi.org/10.1109/icip46576.2022.9897235"},"language":"en","primary_location":{"id":"doi:10.1109/icip46576.2022.9897235","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icip46576.2022.9897235","pdf_url":null,"source":{"id":"https://openalex.org/S4363607719","display_name":"2022 IEEE International Conference on Image Processing (ICIP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE International Conference on Image Processing (ICIP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101577318","display_name":"Ziqiang Zhang","orcid":"https://orcid.org/0000-0003-0110-1543"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zi-Qiang Zhang","raw_affiliation_strings":["University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","Nel-Slip, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Nel-Slip, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100436848","display_name":"Jie Zhang","orcid":"https://orcid.org/0000-0003-1124-0854"},"institutions":[{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]},{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Zhang","raw_affiliation_strings":["University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","iFlytek Co., Ltd.,iFlytek Research,Hefei,China","State Key Laboratory of Acoustics, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China","Nel-Slip, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"iFlytek Co., Ltd.,iFlytek Research,Hefei,China","institution_ids":[]},{"raw_affiliation_string":"State Key Laboratory of Acoustics, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]},{"raw_affiliation_string":"Nel-Slip, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012647109","display_name":"Jian-Shu Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian-Shu Zhang","raw_affiliation_strings":["University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","iFlytek Co., Ltd.,iFlytek Research,Hefei,China","iFlytek Research, iFlytek Co., Ltd., Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"iFlytek Co., Ltd.,iFlytek Research,Hefei,China","institution_ids":[]},{"raw_affiliation_string":"iFlytek Research, iFlytek Co., Ltd., Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101925641","display_name":"Minghui Wu","orcid":"https://orcid.org/0000-0001-8179-7119"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming-Hui Wu","raw_affiliation_strings":["University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","Nel-Slip, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Nel-Slip, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101716884","display_name":"Xin Fang","orcid":"https://orcid.org/0000-0003-4796-9444"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Fang","raw_affiliation_strings":["University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","Nel-Slip, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Nel-Slip, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057227915","display_name":"Li-Rong Dai","orcid":"https://orcid.org/0000-0002-0859-2827"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li-Rong Dai","raw_affiliation_strings":["University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","Nel-Slip, University of Science and Technology of China (USTC), Hefei, China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China (USTC),Nel-Slip,Hefei,China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Nel-Slip, University of Science and Technology of China (USTC), Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101577318"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.8603,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.74439179,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1346","last_page":"1350"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7571506500244141},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7141065001487732},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.601642906665802},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4821719527244568},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4693003296852112},{"id":"https://openalex.org/keywords/generality","display_name":"Generality","score":0.4592222571372986},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.43265342712402344},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.42597317695617676},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.411733478307724},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.26670461893081665},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.18690750002861023},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.15683972835540771},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.0825280249118805}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7571506500244141},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7141065001487732},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.601642906665802},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4821719527244568},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4693003296852112},{"id":"https://openalex.org/C2780767217","wikidata":"https://www.wikidata.org/wiki/Q5532421","display_name":"Generality","level":2,"score":0.4592222571372986},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.43265342712402344},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.42597317695617676},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.411733478307724},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.26670461893081665},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.18690750002861023},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.15683972835540771},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0825280249118805},{"id":"https://openalex.org/C542102704","wikidata":"https://www.wikidata.org/wiki/Q183257","display_name":"Psychotherapist","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icip46576.2022.9897235","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icip46576.2022.9897235","pdf_url":null,"source":{"id":"https://openalex.org/S4363607719","display_name":"2022 IEEE International Conference on Image Processing (ICIP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2022 IEEE International Conference on Image Processing (ICIP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5699999928474426}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":50,"referenced_works":["https://openalex.org/W2015394094","https://openalex.org/W2127141656","https://openalex.org/W2194775991","https://openalex.org/W2511428026","https://openalex.org/W2594690981","https://openalex.org/W2619697695","https://openalex.org/W2889624961","https://openalex.org/W2890052321","https://openalex.org/W2890952074","https://openalex.org/W2891205112","https://openalex.org/W2896457183","https://openalex.org/W2933138175","https://openalex.org/W2950864153","https://openalex.org/W2962960500","https://openalex.org/W2963654155","https://openalex.org/W2973049979","https://openalex.org/W2990408345","https://openalex.org/W3016011581","https://openalex.org/W3036601975","https://openalex.org/W3047388929","https://openalex.org/W3048939150","https://openalex.org/W3099638501","https://openalex.org/W3102469722","https://openalex.org/W3113320078","https://openalex.org/W3119418740","https://openalex.org/W3159929162","https://openalex.org/W3161904430","https://openalex.org/W3162293946","https://openalex.org/W3162707322","https://openalex.org/W3171345413","https://openalex.org/W3175300676","https://openalex.org/W3190580390","https://openalex.org/W3197567540","https://openalex.org/W3198659451","https://openalex.org/W3209059054","https://openalex.org/W4221153068","https://openalex.org/W4287726044","https://openalex.org/W4307823382","https://openalex.org/W4385245566","https://openalex.org/W6739901393","https://openalex.org/W6754048563","https://openalex.org/W6754420807","https://openalex.org/W6755207826","https://openalex.org/W6770805772","https://openalex.org/W6780218876","https://openalex.org/W6780637495","https://openalex.org/W6781364056","https://openalex.org/W6787473416","https://openalex.org/W6794788993","https://openalex.org/W6810168380"],"related_works":["https://openalex.org/W2045049461","https://openalex.org/W4381094582","https://openalex.org/W1978893398","https://openalex.org/W3157841754","https://openalex.org/W4381827277","https://openalex.org/W4390136517","https://openalex.org/W3167558523","https://openalex.org/W3120825179","https://openalex.org/W2999894541","https://openalex.org/W2014028898"],"abstract_inverted_index":{"With":[0],"the":[1,28,35,70,91,124,145],"advance":[2],"in":[3,42,123],"self-supervised":[4,50],"learning":[5,52,63],"for":[6,26,54],"audio":[7,55],"and":[8,75,84,136],"visual":[9],"modalities,":[10,143],"it":[11,106],"has":[12],"become":[13],"possible":[14],"to":[15,95,110],"learn":[16],"a":[17,80,85],"robust":[18],"audio-visual":[19,29,61,73],"speech":[20,30,115,134],"representation.":[21],"This":[22],"would":[23],"be":[24,108],"beneficial":[25],"improving":[27],"recognition":[31,116,135],"(AVSR)":[32],"performance,":[33],"as":[34],"multi-modal":[36],"inputs":[37],"contain":[38],"more":[39],"fruitful":[40],"information":[41],"principle.":[43],"In":[44],"this":[45],"paper,":[46],"based":[47],"on":[48,133],"existing":[49],"representation":[51,62],"methods":[53],"modality,":[56],"we":[57],"therefore":[58],"propose":[59],"an":[60],"approach.":[64],"The":[65,127],"proposed":[66,128],"approach":[67],"explores":[68],"both":[69],"complementarity":[71],"of":[72,104],"modalities":[74],"long-term":[76],"context":[77],"dependency":[78],"using":[79,139],"transformer-based":[81],"fusion":[82,125],"module":[83],"flexible":[86],"masking":[87,119],"strategy.":[88],"After":[89],"pre-training,":[90],"model":[92,130],"is":[93,131,147],"able":[94],"extract":[96],"fused":[97],"representations":[98],"required":[99],"by":[100,117],"AVSR.":[101],"Without":[102],"loss":[103],"generality,":[105],"can":[107],"applied":[109],"single-modal":[111],"tasks,":[112],"e.g.,":[113],"audio/visual":[114],"simply":[118],"out":[120],"one":[121,140],"modality":[122],"module.":[126],"pre-trained":[129],"evaluated":[132],"lipreading":[137],"tasks":[138],"or":[141],"two":[142],"where":[144],"superiority":[146],"revealed.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
