{"id":"https://openalex.org/W4416749793","doi":"https://doi.org/10.1109/iros60139.2025.11246474","title":"Can Real-Time Lipreading Improve Speech Recognition? A Systematic Exploration Using Human-Robot Interaction Data","display_name":"Can Real-Time Lipreading Improve Speech Recognition? A Systematic Exploration Using Human-Robot Interaction Data","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416749793","doi":"https://doi.org/10.1109/iros60139.2025.11246474"},"language":"en","primary_location":{"id":"doi:10.1109/iros60139.2025.11246474","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246474","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114467546","display_name":"Sander Goetzee","orcid":null},"institutions":[{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Sander Goetzee","raw_affiliation_strings":["Vrije Unversiteit Amsterdam"],"affiliations":[{"raw_affiliation_string":"Vrije Unversiteit Amsterdam","institution_ids":["https://openalex.org/I865915315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100387803","display_name":"Yue Li","orcid":"https://orcid.org/0000-0002-5624-7235"},"institutions":[{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Yue Li","raw_affiliation_strings":["Vrije Unversiteit Amsterdam"],"affiliations":[{"raw_affiliation_string":"Vrije Unversiteit Amsterdam","institution_ids":["https://openalex.org/I865915315"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5071518251","display_name":"Koen V. Hindriks","orcid":"https://orcid.org/0000-0002-5707-5236"},"institutions":[{"id":"https://openalex.org/I865915315","display_name":"Vrije Universiteit Amsterdam","ror":"https://ror.org/008xxew50","country_code":"NL","type":"education","lineage":["https://openalex.org/I865915315"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Koen Hindriks","raw_affiliation_strings":["Vrije Unversiteit Amsterdam"],"affiliations":[{"raw_affiliation_string":"Vrije Unversiteit Amsterdam","institution_ids":["https://openalex.org/I865915315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5114467546"],"corresponding_institution_ids":["https://openalex.org/I865915315"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.4643062,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"20608","last_page":"20615"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8355000019073486,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8355000019073486,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.03880000114440918,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.034299999475479126,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6176000237464905},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.45509999990463257},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.4523000121116638},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3564999997615814},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.3521000146865845},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.33550000190734863},{"id":"https://openalex.org/keywords/statistical-model","display_name":"Statistical model","score":0.32440000772476196},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.30320000648498535}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7526000142097473},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7249000072479248},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6176000237464905},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.45509999990463257},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.4523000121116638},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4350000023841858},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.3521000146865845},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3456000089645386},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.33550000190734863},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3043999969959259},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.30320000648498535},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2824999988079071},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2630999982357025},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.25360000133514404},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iros60139.2025.11246474","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11246474","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},{"id":"pmh:oai:research.vu.nl:openaire_cris_publications/ddaf60ad-b4cd-42f3-9b41-32331626b411","is_oa":false,"landing_page_url":"https://hdl.handle.net/1871.1/ddaf60ad-b4cd-42f3-9b41-32331626b411","pdf_url":null,"source":{"id":"https://openalex.org/S4306401107","display_name":"VU Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I865915315","host_organization_name":"Vrije Universiteit Amsterdam","host_organization_lineage":["https://openalex.org/I865915315"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Goetzee, S, Li, Y & Hindriks, K 2025, Can Real-Time Lipreading Improve Speech Recognition? A Systematic Exploration Using Human-Robot Interaction Data. in C Laugier, A Renzaglia, N Atanasov, S Birchfield, G Cielniak, L De Mattos, L Fiorini, P Giguere, K Hashimoto, J Ibanez-Guzman, T Kamegawa, J Lee, G Loianno, K Luck, H Maruyama, P Martinet, H Moradi, U Nunes, J Pettre, A Pretto, T Ranzani, A Ronnau, S Rossi, E Rouse, F Ruggiero, O Simonin, D Wang, M Yang, E Yoshida & H Zhao (eds), 2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) : [Proceedings]. IEEE International Conference on Intelligent Robots and Systems, Institute of Electrical and Electronics Engineers Inc., pp. 20608-20615. https://doi.org/10.1109/IROS60139.2025.11246474","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:research.vu.nl:publications/ddaf60ad-b4cd-42f3-9b41-32331626b411","is_oa":false,"landing_page_url":"https://research.vu.nl/en/publications/ddaf60ad-b4cd-42f3-9b41-32331626b411","pdf_url":null,"source":{"id":"https://openalex.org/S4306401107","display_name":"VU Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I865915315","host_organization_name":"Vrije Universiteit Amsterdam","host_organization_lineage":["https://openalex.org/I865915315"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Goetzee, S, Li, Y & Hindriks, K 2025, Can Real-Time Lipreading Improve Speech Recognition? A Systematic Exploration Using Human-Robot Interaction Data. in C Laugier, A Renzaglia, N Atanasov, S Birchfield, G Cielniak, L De Mattos, L Fiorini, P Giguere, K Hashimoto, J Ibanez-Guzman, T Kamegawa, J Lee, G Loianno, K Luck, H Maruyama, P Martinet, H Moradi, U Nunes, J Pettre, A Pretto, T Ranzani, A Ronnau, S Rossi, E Rouse, F Ruggiero, O Simonin, D Wang, M Yang, E Yoshida & H Zhao (eds), 2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) : [Proceedings]. IEEE International Conference on Intelligent Robots and Systems, Institute of Electrical and Electronics Engineers Inc., pp. 20608-20615. https://doi.org/10.1109/IROS60139.2025.11246474","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1987454298","https://openalex.org/W2030527220","https://openalex.org/W2075094361","https://openalex.org/W2099739380","https://openalex.org/W2105411308","https://openalex.org/W2115973835","https://openalex.org/W2116034851","https://openalex.org/W2124343636","https://openalex.org/W2154353037","https://openalex.org/W2871950322","https://openalex.org/W2890952074","https://openalex.org/W2974141411","https://openalex.org/W3096104971","https://openalex.org/W3162665866","https://openalex.org/W3176461442","https://openalex.org/W4231416407","https://openalex.org/W4231807801","https://openalex.org/W4372346152","https://openalex.org/W4383616896","https://openalex.org/W4391021755","https://openalex.org/W4391689980","https://openalex.org/W4392903177","https://openalex.org/W4394593204","https://openalex.org/W4399267571","https://openalex.org/W4400090069","https://openalex.org/W4403919280","https://openalex.org/W4403920023"],"related_works":[],"abstract_inverted_index":{"Speech":[0,11],"recognition":[1,15,59,190],"in":[2,24,33,46,146,263,270,286],"Human-Robot":[3],"Interaction":[4],"(HRI)":[5],"fully":[6],"relies":[7,17],"on":[8,19,71,87,108,258],"audio-based":[9],"Automatic":[10],"Recognition.":[12],"However,":[13],"speech":[14,50,58,189],"that":[16,56,103,201,210,248,276],"solely":[18],"audio":[20,84,203],"faces":[21],"significant":[22],"challenges":[23],"noisy":[25,156],"environments":[26],"and":[27,127,134,152,155,165,177,195,235,241],"may":[28],"lead":[29],"to":[30,38,42,120,219,225,232,265,282],"poor":[31],"performance":[32,130,186,198,215],"such":[34,115],"environments.":[35,123],"One":[36],"approach":[37],"address":[39],"this":[40,76],"is":[41,279],"also":[43,166,207],"use":[44],"lipreading":[45,86,285],"combination":[47],"with":[48,85,138,204],"traditional":[49],"recognition.":[51],"Recent":[52],"work":[53,278],"has":[54,94,106,267],"shown":[55],"audiovisual":[57,136,256],"(AVSR)":[60],"can":[61,117],"achieve":[62],"a":[63,88,148,153,259],"Word":[64],"Error":[65],"Rate":[66],"(WER)":[67],"of":[68,82,131,171,216],"only":[69],"0.9%":[70],"the":[72,80,129,161,168,213,226,251,255,271],"dataset":[73],"LRS3.":[74],"In":[75],"paper,":[77],"we":[78],"assess":[79],"potential":[81],"combining":[83],"social":[89,121,260],"robot":[90,122,261],"platform,":[91,262],"Pepper,":[92],"which":[93],"not":[95],"yet":[96],"been":[97,268],"widely":[98],"tested":[99],"for":[100,187,199,249],"AVSR.":[101],"Given":[102],"prior":[104],"research":[105],"focused":[107],"non-robotic":[109],"domains,":[110],"it":[111],"remains":[112],"unclear":[113],"whether":[114],"models":[116,137,173,200,218,253,257],"generalize":[118],"well":[119],"We":[124,159,206,274],"systematically":[125],"evaluate":[126],"compare":[128],"established":[132],"offline":[133],"real-time":[135,172,185,197],"their":[139],"audio-only":[140,188,252],"counterparts.":[141],"The":[142,182],"experiments":[143],"were":[144],"conducted":[145],"both":[147],"controlled":[149],"laboratory":[150],"setting":[151],"dynamic":[154],"public":[157],"environment.":[158],"evaluated":[160],"data":[162],"using":[163],"WER":[164],"measured":[167],"inference":[169,214],"latency":[170,193],"via":[174],"Real-Time":[175],"Factor":[176],"Words":[178],"Per":[179],"Second":[180],"rates.":[181],"results":[183],"demonstrate":[184],"across":[191],"all":[192],"metrics":[194],"near":[196],"combine":[202],"lipreading.":[205],"explored":[208],"factors":[209,230],"might":[211],"influence":[212],"these":[217],"understand":[220],"how":[221],"much":[222],"video":[223],"contributes":[224],"audio.":[227],"This":[228],"includes":[229],"related":[231],"(1)":[233],"environmental":[234],"temporal":[236],"variations,":[237],"(2)":[238],"model":[239],"behavior,":[240],"(3)":[242],"implementation":[243],"choices.":[244],"Our":[245],"findings":[246],"indicate":[247],"now":[250],"outperform":[254],"contrast":[264],"what":[266],"reported":[269],"benchmarked":[272],"literature.":[273],"conclude":[275],"more":[277],"still":[280],"needed":[281],"benefit":[283],"from":[284],"HRI.":[287]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-28T00:00:00"}
