{"id":"https://openalex.org/W2071828724","doi":"https://doi.org/10.1145/1027933.1027972","title":"A segment-based audio-visual speech recognizer","display_name":"A segment-based audio-visual speech recognizer","publication_year":2004,"publication_date":"2004-10-13","ids":{"openalex":"https://openalex.org/W2071828724","doi":"https://doi.org/10.1145/1027933.1027972","mag":"2071828724"},"language":"en","primary_location":{"id":"doi:10.1145/1027933.1027972","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1027933.1027972","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 6th international conference on Multimodal interfaces","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114018312","display_name":"Timothy J. Hazen","orcid":"https://orcid.org/0009-0006-1413-9590"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Timothy J. Hazen","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075906727","display_name":"Kate Saenko","orcid":"https://orcid.org/0000-0002-7564-7218"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kate Saenko","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000811712","display_name":"Chia-Hao La","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chia-Hao La","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112758056","display_name":"James Glass","orcid":"https://orcid.org/0000-0002-3097-360X"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"James R. Glass","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"MIT, Computer Science and Artificial Intelligence Laboratory, Cambridge, MA","institution_ids":["https://openalex.org/I63966007"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5114018312"],"corresponding_institution_ids":["https://openalex.org/I63966007"],"apc_list":null,"apc_paid":null,"fwci":5.9996,"has_fulltext":false,"cited_by_count":109,"citation_normalized_percentile":{"value":0.96688949,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"235","last_page":"242"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/timit","display_name":"TIMIT","score":0.9214491248130798},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.8493942022323608},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8398860096931458},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.7753145694732666},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.5411507487297058},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48456278443336487},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.4727039337158203},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4558192789554596},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4300157427787781},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.42698365449905396},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.41903817653656006},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.4141538143157959},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3966808319091797},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32041436433792114},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.2805732488632202},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.11707255244255066}],"concepts":[{"id":"https://openalex.org/C2778724510","wikidata":"https://www.wikidata.org/wiki/Q7670405","display_name":"TIMIT","level":3,"score":0.9214491248130798},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.8493942022323608},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8398860096931458},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.7753145694732666},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.5411507487297058},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48456278443336487},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.4727039337158203},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4558192789554596},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4300157427787781},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.42698365449905396},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.41903817653656006},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.4141538143157959},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3966808319091797},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32041436433792114},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2805732488632202},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.11707255244255066},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1027933.1027972","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1027933.1027972","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 6th international conference on Multimodal interfaces","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.41999998688697815,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W124434589","https://openalex.org/W196476531","https://openalex.org/W636971608","https://openalex.org/W1482526072","https://openalex.org/W1511268987","https://openalex.org/W1527240141","https://openalex.org/W1562289873","https://openalex.org/W1633840029","https://openalex.org/W1664248204","https://openalex.org/W2045956438","https://openalex.org/W2058107683","https://openalex.org/W2074417845","https://openalex.org/W2077804127","https://openalex.org/W2111732304","https://openalex.org/W2112348857","https://openalex.org/W2121486117","https://openalex.org/W2121783296","https://openalex.org/W2151043030","https://openalex.org/W2491224255","https://openalex.org/W3099202502","https://openalex.org/W6620774151"],"related_works":["https://openalex.org/W3127686677","https://openalex.org/W80423236","https://openalex.org/W2592014004","https://openalex.org/W2157598242","https://openalex.org/W1911859126","https://openalex.org/W3151376046","https://openalex.org/W3089379469","https://openalex.org/W642007152","https://openalex.org/W2903652364","https://openalex.org/W2147630093"],"abstract_inverted_index":{"This":[0,50],"paper":[1],"presents":[2],"the":[3,88],"development":[4],"and":[5],"evaluation":[6],"of":[7,38,42],"a":[8,17,28,63],"speaker-independent":[9],"audio-visual":[10,65],"speech":[11,44,89],"recognition":[12,81,90],"(AVSR)":[13],"system":[14,60],"that":[15],"utilizes":[16],"segment-based":[18],"modeling":[19],"strategy.":[20],"To":[21],"support":[22],"this":[23],"research,":[24],"we":[25],"have":[26,76],"collected":[27,45],"new":[29,51,58],"video":[30],"corpus,":[31],"called":[32],"Audio-Visual":[33],"TIMIT":[34],"(AV-TIMIT),":[35],"which":[36,61],"consists":[37],"4":[39],"total":[40],"hours":[41],"read":[43],"from":[46],"223":[47],"different":[48],"speakers.":[49],"corpus":[52],"was":[53],"used":[54],"to":[55],"evaluate":[56],"our":[57],"AVSR":[59],"incorporates":[62],"novel":[64],"integration":[66],"scheme":[67],"using":[68],"segment-constrained":[69],"Hidden":[70],"Markov":[71],"Models":[72],"(HMMs).":[73],"Preliminary":[74],"experiments":[75],"demonstrated":[77],"improvements":[78],"in":[79],"phonetic":[80],"performance":[82],"when":[83],"incorporating":[84],"visual":[85],"information":[86],"into":[87],"process.":[91]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":7},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":5},{"year":2017,"cited_by_count":12},{"year":2016,"cited_by_count":6},{"year":2015,"cited_by_count":5},{"year":2014,"cited_by_count":7},{"year":2013,"cited_by_count":4},{"year":2012,"cited_by_count":4}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
