{"id":"https://openalex.org/W4415708654","doi":"https://doi.org/10.1109/icme59968.2025.11209403","title":"Enhanced Self-Supervised Multi-View Representations with Modality-Missing Robustness for Audio-Visual Speech Recognition","display_name":"Enhanced Self-Supervised Multi-View Representations with Modality-Missing Robustness for Audio-Visual Speech Recognition","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708654","doi":"https://doi.org/10.1109/icme59968.2025.11209403"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11209403","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209403","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052100570","display_name":"Fei Su","orcid":"https://orcid.org/0000-0002-3529-9845"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Fei Su","raw_affiliation_strings":["Wuhan University,School of Computer Science,Wuhan,China,430072"],"affiliations":[{"raw_affiliation_string":"Wuhan University,School of Computer Science,Wuhan,China,430072","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113355139","display_name":"Cancan Li","orcid":"https://orcid.org/0009-0002-2537-6293"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cancan Li","raw_affiliation_strings":["Wuhan University,School of Computer Science,Wuhan,China,430072"],"affiliations":[{"raw_affiliation_string":"Wuhan University,School of Computer Science,Wuhan,China,430072","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100361707","display_name":"Jun Liu","orcid":"https://orcid.org/0000-0001-8859-5405"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Juan Liu","raw_affiliation_strings":["Wuhan University,School of Artificial Intelligence,Wuhan,China,430072"],"affiliations":[{"raw_affiliation_string":"Wuhan University,School of Artificial Intelligence,Wuhan,China,430072","institution_ids":["https://openalex.org/I37461747"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5052100570"],"corresponding_institution_ids":["https://openalex.org/I37461747"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.42460505,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9807999730110168,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9807999730110168,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.008200000040233135,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.003700000001117587,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6371999979019165},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.5396000146865845},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.453000009059906},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.42579999566078186},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3813999891281128},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.38109999895095825},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3100999891757965},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.29350000619888306}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7581999897956848},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6478000283241272},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6371999979019165},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5623000264167786},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.5396000146865845},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.453000009059906},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3813999891281128},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.38109999895095825},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.31130000948905945},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3100999891757965},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C60692881","wikidata":"https://www.wikidata.org/wiki/Q584529","display_name":"Humanoid robot","level":3,"score":0.2694999873638153},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.2621999979019165},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2549999952316284},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11209403","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209403","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1526392145","https://openalex.org/W1549662771","https://openalex.org/W1935685005","https://openalex.org/W2016067832","https://openalex.org/W2106284211","https://openalex.org/W2157190406","https://openalex.org/W2237250383","https://openalex.org/W2301937176","https://openalex.org/W2511073133","https://openalex.org/W2548498729","https://openalex.org/W2551572271","https://openalex.org/W2755403114","https://openalex.org/W2769666294","https://openalex.org/W2806833697","https://openalex.org/W2889050557","https://openalex.org/W2890952074","https://openalex.org/W2901907199","https://openalex.org/W2935794029","https://openalex.org/W2963528589","https://openalex.org/W2972756321","https://openalex.org/W2997909293","https://openalex.org/W3006974783","https://openalex.org/W3015830103","https://openalex.org/W3109585842","https://openalex.org/W3114214226","https://openalex.org/W3152754274","https://openalex.org/W3162293946","https://openalex.org/W3167917117","https://openalex.org/W3197567540","https://openalex.org/W3206835951","https://openalex.org/W4297841641","https://openalex.org/W4307286264","https://openalex.org/W4312578889","https://openalex.org/W4402112360","https://openalex.org/W4404792841","https://openalex.org/W4408354236"],"related_works":[],"abstract_inverted_index":{"Audio-Visual":[0],"Speech":[1],"Recognition":[2],"(AVSR)":[3],"leverages":[4],"visual":[5,27,82],"information":[6],"to":[7,47,75,77],"enhance":[8],"speech":[9],"understanding.":[10],"However,":[11],"current":[12],"models":[13],"assume":[14],"stable,":[15],"frontal":[16],"viewpoints,":[17],"suffering":[18],"significant":[19],"performance":[20,79],"drops":[21],"with":[22,91],"non-frontal":[23],"angles":[24],"or":[25],"when":[26,81],"input":[28],"is":[29],"missing.":[30],"Our":[31],"approach":[32,97,130],"employs":[33],"a":[34,54,67,109],"multi-view":[35,56,133],"data":[36,46],"generation":[37],"strategy":[38],"using":[39],"3D":[40],"head":[41,50],"avatar":[42],"reconstruction,":[43],"synthesizing":[44],"viewpoint-diverse":[45],"handle":[48],"varying":[49],"poses.":[51],"We":[52],"introduce":[53],"self-supervised":[55],"representation":[57],"learning":[58],"model":[59,74],"(MVL),":[60],"ensuring":[61],"viewpoint-invariant":[62],"and":[63,107,134,138],"domain-agnostic":[64],"embeddings.":[65],"Moreover,":[66,116],"Unified":[68],"Modality":[69],"Adapter":[70],"(UMA)":[71],"enables":[72],"the":[73,92,117,120,126],"revert":[76],"audio-only":[78],"levels":[80],"inputs":[83],"are":[84],"unavailable.":[85],"Experimental":[86],"results":[87],"demonstrate":[88],"that,":[89],"compared":[90],"baseline":[93,121],"AV-HuBERT":[94],"model,":[95],"our":[96,129],"improves":[98],"lip-reading":[99],"accuracy":[100],"by":[101],"3.0%":[102],"under":[103,131],"large":[104],"pose":[105],"deviations":[106],"yields":[108],"12.3%":[110],"overall":[111],"gain":[112],"in":[113],"AVSR":[114],"performance.":[115],"system":[118],"outperforms":[119],"on":[122],"real":[123],"data,":[124],"confirming":[125],"generalizability":[127],"of":[128],"challenging":[132],"modality-missing":[135],"scenarios.":[136],"Code":[137],"Data:":[139],"https://yakumostudio.github.io/yakumo.github.io/mvss/mvss.html.":[140]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-30T00:00:00"}
