{"id":"https://openalex.org/W4281892454","doi":"https://doi.org/10.1145/3536221.3556571","title":"Is Lip Region-of-Interest Sufficient for Lipreading?","display_name":"Is Lip Region-of-Interest Sufficient for Lipreading?","publication_year":2022,"publication_date":"2022-11-04","ids":{"openalex":"https://openalex.org/W4281892454","doi":"https://doi.org/10.1145/3536221.3556571"},"language":"en","primary_location":{"id":"doi:10.1145/3536221.3556571","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3536221.3556571","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 International Conference on Multimodal Interaction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073662121","display_name":"Jing-Xuan Zhang","orcid":"https://orcid.org/0000-0003-4341-3174"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jing-Xuan Zhang","raw_affiliation_strings":["iFLYTEK Research, iFLYTEK Co., Ltd., China and University of Science and Technology of China, China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Co., Ltd., China and University of Science and Technology of China, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011133553","display_name":"Genshun Wan","orcid":"https://orcid.org/0000-0002-5813-9430"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Genshun Wan","raw_affiliation_strings":["iFLYTEK Research, iFLYTEK Co., Ltd, China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Co., Ltd, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076812698","display_name":"Jia Pan","orcid":"https://orcid.org/0000-0001-9003-2054"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia Pan","raw_affiliation_strings":["iFLYTEK Research, iFLYTEK Co., Ltd., China"],"affiliations":[{"raw_affiliation_string":"iFLYTEK Research, iFLYTEK Co., Ltd., China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5073662121"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":1.1905,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.77144783,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"368","last_page":"372"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13289","display_name":"Infant Health and Development","score":0.9631999731063843,"subfield":{"id":"https://openalex.org/subfields/3611","display_name":"Pharmacy"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7203828692436218},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.6980932950973511},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6712796688079834},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5836281776428223},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5713100433349609},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.555664598941803},{"id":"https://openalex.org/keywords/facial-recognition-system","display_name":"Facial recognition system","score":0.5022430419921875},{"id":"https://openalex.org/keywords/extractor","display_name":"Extractor","score":0.49656325578689575},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4816652536392212},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.42261332273483276},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.4224373996257782},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.41577261686325073},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.410061240196228},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.11731967329978943}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7203828692436218},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6980932950973511},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6712796688079834},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5836281776428223},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5713100433349609},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.555664598941803},{"id":"https://openalex.org/C31510193","wikidata":"https://www.wikidata.org/wiki/Q1192553","display_name":"Facial recognition system","level":3,"score":0.5022430419921875},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.49656325578689575},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4816652536392212},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.42261332273483276},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.4224373996257782},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.41577261686325073},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.410061240196228},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.11731967329978943},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C21880701","wikidata":"https://www.wikidata.org/wiki/Q2144042","display_name":"Process engineering","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3536221.3556571","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3536221.3556571","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2022 International Conference on Multimodal Interaction","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5699999928474426}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1503933356","https://openalex.org/W1664547674","https://openalex.org/W2046056978","https://openalex.org/W2051676197","https://openalex.org/W2164240571","https://openalex.org/W2267805933","https://openalex.org/W2404901536","https://openalex.org/W2405713570","https://openalex.org/W2556171197","https://openalex.org/W2585824449","https://openalex.org/W2890952074","https://openalex.org/W2963654155","https://openalex.org/W2963785710","https://openalex.org/W2963894079","https://openalex.org/W3015501067","https://openalex.org/W3015830103","https://openalex.org/W3016011581","https://openalex.org/W3035042697","https://openalex.org/W3036601975","https://openalex.org/W3099231112","https://openalex.org/W3128564814","https://openalex.org/W3162293946","https://openalex.org/W3174954176","https://openalex.org/W3209059054"],"related_works":["https://openalex.org/W1979583797","https://openalex.org/W3082848404","https://openalex.org/W2016864125","https://openalex.org/W2372254676","https://openalex.org/W2793679056","https://openalex.org/W3145050838","https://openalex.org/W2080135837","https://openalex.org/W1941834444","https://openalex.org/W2985118265","https://openalex.org/W2944691285"],"abstract_inverted_index":{"Lip":[0],"region-of-interest":[1],"(ROI)":[2],"is":[3,77],"conventionally":[4],"used":[5],"for":[6,89],"visual":[7,21,38,65,135],"input":[8,22,144,153],"in":[9,103,154],"the":[10,17,27,74,86,112,124,129,140,155],"lipreading":[11,90,125],"task.":[12],"Few":[13],"works":[14],"have":[15],"adopted":[16,102],"entire":[18,75,87,113],"face":[19,28,76,88,114,143],"as":[20,51,134],"because":[23],"lip-excluded":[24],"parts":[25],"of":[26,157,172],"are":[29],"usually":[30],"considered":[31],"to":[32,37,84],"be":[33],"redundant":[34],"and":[35],"irrelevant":[36],"speech":[39,66],"recognition.":[40],"However,":[41],"faces":[42],"contain":[43],"much":[44],"more":[45],"detailed":[46],"information":[47,62],"than":[48,149],"lips,":[49],"such":[50,61],"speakers\u2019":[52],"head":[53],"pose,":[54],"emotion,":[55],"identity":[56],"etc.":[57],"We":[58],"argue":[59],"that":[60,110,150],"might":[63],"benefit":[64],"recognition":[67],"if":[68],"a":[69,146,164],"powerful":[70],"feature":[71],"extractor":[72],"employing":[73],"trained.":[78],"In":[79],"this":[80],"work,":[81],"we":[82],"propose":[83],"adopt":[85],"with":[91,128,142],"self-supervised":[92,98,138],"learning.":[93],"AV-HuBERT,":[94],"an":[95],"audio-visual":[96],"multi-modal":[97],"learning":[99],"framework,":[100],"was":[101],"our":[104],"experiments.":[105],"Our":[106],"experimental":[107],"results":[108],"showed":[109],"adopting":[111],"achieved":[115,145],"16%":[116],"relative":[117],"word":[118],"error":[119],"rate":[120],"(WER)":[121],"reduction":[122],"on":[123],"task,":[126],"compared":[127],"baseline":[130],"method":[131],"using":[132,151,169],"lip":[133,152],"input.":[136],"Without":[137],"pretraining,":[139],"model":[141],"higher":[147],"WER":[148,167],"case":[156],"limited":[158],"training":[159,173],"data":[160,174],"(30":[161],"hours),":[162],"while":[163],"slightly":[165],"lower":[166],"when":[168],"large":[170],"amount":[171],"(433":[175],"hours).":[176]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
