{"id":"https://openalex.org/W4406461808","doi":"https://doi.org/10.1109/slt61566.2024.10832305","title":"Learning Video Temporal Dynamics With Cross-Modal Attention For Robust Audio-Visual Speech Recognition","display_name":"Learning Video Temporal Dynamics With Cross-Modal Attention For Robust Audio-Visual Speech Recognition","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461808","doi":"https://doi.org/10.1109/slt61566.2024.10832305"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832305","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832305","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048886717","display_name":"Sungnyun Kim","orcid":"https://orcid.org/0000-0002-3251-1812"},"institutions":[{"id":"https://openalex.org/I3130020584","display_name":"International Graduate School of English","ror":"https://ror.org/0181xpd52","country_code":"KR","type":"education","lineage":["https://openalex.org/I3130020584"]},{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Sungnyun Kim","raw_affiliation_strings":["KAIST,Kim Jaechul Graduate School of AI"],"affiliations":[{"raw_affiliation_string":"KAIST,Kim Jaechul Graduate School of AI","institution_ids":["https://openalex.org/I3130020584","https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082991654","display_name":"Kangwook Jang","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Kangwook Jang","raw_affiliation_strings":["KAIST,School of Electrical Engineering"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112425608","display_name":"Sangmin Bae","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]},{"id":"https://openalex.org/I3130020584","display_name":"International Graduate School of English","ror":"https://ror.org/0181xpd52","country_code":"KR","type":"education","lineage":["https://openalex.org/I3130020584"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Sangmin Bae","raw_affiliation_strings":["KAIST,Kim Jaechul Graduate School of AI"],"affiliations":[{"raw_affiliation_string":"KAIST,Kim Jaechul Graduate School of AI","institution_ids":["https://openalex.org/I3130020584","https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066573438","display_name":"Hoirin Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hoirin Kim","raw_affiliation_strings":["KAIST,School of Electrical Engineering"],"affiliations":[{"raw_affiliation_string":"KAIST,School of Electrical Engineering","institution_ids":["https://openalex.org/I157485424"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091674853","display_name":"Se-Young Yun","orcid":"https://orcid.org/0000-0001-6675-5113"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]},{"id":"https://openalex.org/I3130020584","display_name":"International Graduate School of English","ror":"https://ror.org/0181xpd52","country_code":"KR","type":"education","lineage":["https://openalex.org/I3130020584"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Se-Young Yun","raw_affiliation_strings":["KAIST,Kim Jaechul Graduate School of AI"],"affiliations":[{"raw_affiliation_string":"KAIST,Kim Jaechul Graduate School of AI","institution_ids":["https://openalex.org/I3130020584","https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5048886717"],"corresponding_institution_ids":["https://openalex.org/I157485424","https://openalex.org/I3130020584"],"apc_list":null,"apc_paid":null,"fwci":1.1681,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.8022508,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"447","last_page":"454"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7630938291549683},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7326904535293579},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.6055516600608826},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.58174729347229},{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics (music)","score":0.5360206961631775},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41297999024391174},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.2536148428916931},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.12475991249084473}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7630938291549683},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7326904535293579},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.6055516600608826},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.58174729347229},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.5360206961631775},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41297999024391174},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2536148428916931},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.12475991249084473},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/slt61566.2024.10832305","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832305","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W1503933356","https://openalex.org/W2076462394","https://openalex.org/W2219249508","https://openalex.org/W2551572271","https://openalex.org/W2557227117","https://openalex.org/W2781335552","https://openalex.org/W2792764867","https://openalex.org/W2795087793","https://openalex.org/W2808631503","https://openalex.org/W2890952074","https://openalex.org/W2891205112","https://openalex.org/W2963654155","https://openalex.org/W2986833982","https://openalex.org/W3035042697","https://openalex.org/W3162293946","https://openalex.org/W3167917117","https://openalex.org/W3205533980","https://openalex.org/W4297841411","https://openalex.org/W4297841641","https://openalex.org/W4308236834","https://openalex.org/W4319300051","https://openalex.org/W4372260292","https://openalex.org/W4372342491","https://openalex.org/W4372346152","https://openalex.org/W4382202533","https://openalex.org/W4382237564","https://openalex.org/W4385245566","https://openalex.org/W4385571549","https://openalex.org/W4385571863","https://openalex.org/W4385767994","https://openalex.org/W4391021811","https://openalex.org/W4392903321","https://openalex.org/W4392903825","https://openalex.org/W4392908934","https://openalex.org/W4393158550","https://openalex.org/W6688816777","https://openalex.org/W6749825310","https://openalex.org/W6754420807","https://openalex.org/W6781364056","https://openalex.org/W6810168380","https://openalex.org/W6839936984","https://openalex.org/W6840129460","https://openalex.org/W6847652939"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W4245955731","https://openalex.org/W2393726419"],"abstract_inverted_index":{"Audio-visual":[0],"speech":[1,8,86,129,137],"recognition":[2],"(AVSR)":[3],"aims":[4],"to":[5,77,134],"transcribe":[6],"human":[7],"using":[9],"both":[10],"audio":[11,36,82],"and":[12,66,112,128,167],"video":[13,24,44,52,60,70,79,97,148],"modalities.":[14],"In":[15,46],"practical":[16],"environments":[17],"with":[18,81],"noise-corrupted":[19],"audio,":[20],"the":[21,41,51,67,96,106,110,117,132,136,147,152,159,163,168],"role":[22],"of":[23,43,69,154],"information":[25,83],"becomes":[26],"crucial.":[27],"However,":[28],"prior":[29],"works":[30],"have":[31],"primarily":[32],"focused":[33],"on":[34,95,101,109],"enhancing":[35],"features":[37,53,80],"in":[38,59,123,146],"AVSR,":[39],"overlooking":[40],"importance":[42],"features.":[45],"this":[47],"study,":[48],"we":[49,104],"strengthen":[50],"by":[54,157],"learning":[55],"three":[56],"temporal":[57,98,164],"dynamics":[58,165],"data:":[61],"context":[62],"order,":[63],"playback":[64],"direction,":[65],"speed":[68],"frames.":[71],"Cross-modal":[72],"attention":[73,170],"modules":[74],"are":[75],"introduced":[76],"enrich":[78],"so":[84],"that":[85,139],"variability":[87],"can":[88],"be":[89,141],"taken":[90],"into":[91],"account":[92],"when":[93],"training":[94],"dynamics.":[99],"Based":[100],"our":[102,155],"approach,":[103],"achieve":[105],"state-of-the-art":[107],"performance":[108],"LRS2":[111],"LRS3":[113],"AVSR":[114],"benchmarks":[115],"for":[116,126,162],"noise-dominant":[118],"settings.":[119],"Our":[120],"approach":[121],"excels":[122],"scenarios":[124],"especially":[125],"babble":[127],"noise,":[130],"indicating":[131],"ability":[133],"distinguish":[135],"signal":[138],"should":[140],"recognized":[142],"from":[143],"lip":[144],"movements":[145],"modality.":[149],"We":[150],"support":[151],"validity":[153],"methodology":[156],"offering":[158],"ablation":[160],"experiments":[161],"losses":[166],"cross-modal":[169],"architecture":[171],"design.":[172]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
