{"id":"https://openalex.org/W4392909863","doi":"https://doi.org/10.1109/icassp48485.2024.10446142","title":"Multi-Modality Speech Recognition Driven by Background Visual Scenes","display_name":"Multi-Modality Speech Recognition Driven by Background Visual Scenes","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392909863","doi":"https://doi.org/10.1109/icassp48485.2024.10446142"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446142","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446142","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101459018","display_name":"Cheng Luo","orcid":"https://orcid.org/0000-0002-2797-3641"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cheng Luo","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China","Zhejiang Lab, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China","institution_ids":["https://openalex.org/I4210123185"]},{"raw_affiliation_string":"Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101780712","display_name":"Yiguang Liu","orcid":"https://orcid.org/0000-0002-6115-0921"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiguang Liu","raw_affiliation_strings":["Zhejiang University,Hangzhou,China","Zhejiang University, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhejiang University,Hangzhou,China","institution_ids":["https://openalex.org/I76130692"]},{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065076533","display_name":"Wenhui Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenhui Sun","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China","Zhejiang Lab, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China","institution_ids":["https://openalex.org/I4210123185"]},{"raw_affiliation_string":"Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003191324","display_name":"Zhoujian Sun","orcid":"https://orcid.org/0000-0002-5384-9695"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhoujian Sun","raw_affiliation_strings":["Zhejiang Lab,Hangzhou,China","Zhejiang Lab, Hangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhejiang Lab,Hangzhou,China","institution_ids":["https://openalex.org/I4210123185"]},{"raw_affiliation_string":"Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3057,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.46245164,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"10926","last_page":"10930"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9685999751091003,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9685999751091003,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9218000173568726,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.7190594673156738},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7106975317001343},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5479335188865662},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.52543044090271},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.38799917697906494},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.36050665378570557}],"concepts":[{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.7190594673156738},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7106975317001343},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5479335188865662},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.52543044090271},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.38799917697906494},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36050665378570557}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446142","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446142","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/5","display_name":"Gender equality","score":0.4699999988079071}],"awards":[{"id":"https://openalex.org/G1985370426","display_name":null,"funder_award_id":"2021ZD0201501","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G8060470696","display_name":null,"funder_award_id":"32200860","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null},{"id":"https://openalex.org/F4320337504","display_name":"Research and Development","ror":"https://ror.org/027s68j25"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1882423120","https://openalex.org/W2014621385","https://openalex.org/W2032337854","https://openalex.org/W2069620637","https://openalex.org/W2073227393","https://openalex.org/W2099940393","https://openalex.org/W2594690981","https://openalex.org/W2890952074","https://openalex.org/W2891205112","https://openalex.org/W2991924442","https://openalex.org/W2995028981","https://openalex.org/W3009620048","https://openalex.org/W3016011581","https://openalex.org/W3081788987","https://openalex.org/W3162293946","https://openalex.org/W4221153068","https://openalex.org/W4223499953","https://openalex.org/W4223533877","https://openalex.org/W4297841641","https://openalex.org/W4319300051","https://openalex.org/W4385245566","https://openalex.org/W6754420807","https://openalex.org/W6810168380"],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Visual":[0],"information":[1,21,45,175,199],"is":[2],"often":[3],"used":[4],"as":[5,78],"a":[6,62,121],"complementary":[7],"cue":[8],"for":[9,136,165],"automatic":[10,51,137,206],"speech":[11,35,52,138,207],"recognition":[12,31,36],"in":[13,57,110,190,205],"noisy":[14,191],"environments.":[15,192],"Most":[16],"previous":[17],"studies":[18],"utilize":[19],"visual":[20,44,104,134,174,198],"of":[22,33,46,82,85,106,131],"target":[23,158],"speakers":[24],"(e.g.,":[25],"lip":[26],"movements)":[27],"to":[28,160,188,202],"improve":[29],"the":[30,145,152,166],"performance":[32,183,204],"audio-visual":[34],"(AVSR)":[37],"models.":[38],"However,":[39],"it":[40],"remains":[41],"unclear":[42],"whether":[43],"background":[47,108,149,178],"sound":[48],"can":[49,200],"benefit":[50],"recognition.":[53,139,208],"Our":[54],"study":[55],"proceeds":[56],"this":[58,141],"regard":[59],"by":[60,186],"constructing":[61],"new":[63,72],"audiovisual":[64],"dataset":[65,100,147,156],"and":[66,91,103,133],"devising":[67],"an":[68],"AVSR":[69,115,167],"model.":[70,168],"The":[71,98,114,169],"dataset,":[73,80],"Audio-Visual":[74],"Natural":[75],"Scenes":[76],"(abbreviated":[77],"AVNS)":[79],"consists":[81],"11":[83],"types":[84],"natural":[86,111],"scenes":[87],"(around":[88],"31.3":[89],"hours)":[90],"was":[92,117],"recorded":[93],"through":[94],"professional":[95],"recording":[96],"devices.":[97],"AVNS":[99,146],"provides":[101],"audio":[102,132],"signals":[105],"common":[107],"noises":[109,179],"acoustic":[112],"scenes.":[113],"model":[116,182,203],"designed":[118],"based":[119],"on":[120],"representation":[122],"learning":[123],"framework":[124],"called":[125],"AV-HuBERT,":[126],"which":[127],"could":[128],"fuse":[129],"representations":[130],"modalities":[135],"In":[140],"work,":[142],"we":[143],"combined":[144],"(providing":[148,157],"sound)":[150],"with":[151,177],"largest":[153],"benchmark":[154],"LRS3":[155],"speech)":[159],"create":[161],"adverse":[162],"noise":[163],"conditions":[164],"results":[170],"showed":[171],"that":[172,196],"incorporating":[173],"synchronized":[176],"greatly":[180],"improved":[181],"(reducing":[184],"WER":[185],"up":[187],"4.9%)":[189],"These":[193],"findings":[194],"demonstrate":[195],"noise-related":[197],"contribute":[201]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
