{"id":"https://openalex.org/W4321648689","doi":"https://doi.org/10.48550/arxiv.2302.10915","title":"Conformers are All You Need for Visual Speech Recognition","display_name":"Conformers are All You Need for Visual Speech Recognition","publication_year":2023,"publication_date":"2023-02-17","ids":{"openalex":"https://openalex.org/W4321648689","doi":"https://doi.org/10.48550/arxiv.2302.10915"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2302.10915","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2302.10915","pdf_url":"https://arxiv.org/pdf/2302.10915","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2302.10915","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073775071","display_name":"Oscar Chang","orcid":"https://orcid.org/0000-0002-4336-7545"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chang, Oscar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110698977","display_name":"Hank Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Hank","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023556869","display_name":"Dmitriy Serdyuk","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Serdyuk, Dmitriy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101531439","display_name":"Ankit Shah","orcid":"https://orcid.org/0000-0002-8838-5421"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Ankit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5005881531","display_name":"Olivier Siohan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siohan, Olivier","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5073775071"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10326","display_name":"Indoor and Outdoor Localization Technologies","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9793000221252441,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.725725531578064},{"id":"https://openalex.org/keywords/receptive-field","display_name":"Receptive field","score":0.6588113903999329},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6313692331314087},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6244585514068604},{"id":"https://openalex.org/keywords/front-and-back-ends","display_name":"Front and back ends","score":0.6116266250610352},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4844813048839569},{"id":"https://openalex.org/keywords/front","display_name":"Front (military)","score":0.4579334557056427},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4266253113746643},{"id":"https://openalex.org/keywords/visual-field","display_name":"Visual field","score":0.4231802821159363},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.14856937527656555},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.10106128454208374},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06780663132667542}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.725725531578064},{"id":"https://openalex.org/C19071747","wikidata":"https://www.wikidata.org/wiki/Q1755207","display_name":"Receptive field","level":2,"score":0.6588113903999329},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6313692331314087},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6244585514068604},{"id":"https://openalex.org/C53016008","wikidata":"https://www.wikidata.org/wiki/Q620167","display_name":"Front and back ends","level":2,"score":0.6116266250610352},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4844813048839569},{"id":"https://openalex.org/C2777551076","wikidata":"https://www.wikidata.org/wiki/Q842332","display_name":"Front (military)","level":2,"score":0.4579334557056427},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4266253113746643},{"id":"https://openalex.org/C2776058522","wikidata":"https://www.wikidata.org/wiki/Q2364768","display_name":"Visual field","level":2,"score":0.4231802821159363},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.14856937527656555},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.10106128454208374},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06780663132667542},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2302.10915","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2302.10915","pdf_url":"https://arxiv.org/pdf/2302.10915","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2302.10915","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2302.10915","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2302.10915","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2302.10915","pdf_url":"https://arxiv.org/pdf/2302.10915","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2089544495","https://openalex.org/W2079003682","https://openalex.org/W1555021777","https://openalex.org/W2913266608","https://openalex.org/W2799648451","https://openalex.org/W1964918325","https://openalex.org/W2189496153","https://openalex.org/W2186491718","https://openalex.org/W2034008118","https://openalex.org/W2055164815"],"abstract_inverted_index":{"Visual":[0],"speech":[1,77,134],"recognition":[2,135],"models":[3,147],"extract":[4,72],"visual":[5,18,66,85,97,104,133],"features":[6,75],"in":[7,113],"a":[8,17,21,54,95,102,108,126],"hierarchical":[9],"manner.":[10],"At":[11,36],"the":[12,28,32,37,47,51,65,69,137,143],"lower":[13,114],"level,":[14,39],"there":[15,40],"is":[16,41],"front-end":[19,52,67,105],"with":[20,107],"limited":[22],"temporal":[23,56],"receptive":[24,57],"field":[25],"that":[26,44,83,101],"processes":[27],"raw":[29],"pixels":[30],"depicting":[31],"lips":[33],"or":[34],"faces.":[35],"higher":[38],"an":[42],"encoder":[43,111],"attends":[45],"to":[46,71,94],"embeddings":[48],"produced":[49],"by":[50],"over":[53],"large":[55],"field.":[58],"Previous":[59],"work":[60,81],"has":[61],"focused":[62],"on":[63,136],"improving":[64],"of":[68,91,129,145],"model":[70],"more":[73,116],"useful":[74],"for":[76,132],"recognition.":[78],"Surprisingly,":[79],"our":[80],"shows":[82],"complex":[84],"front-ends":[86],"are":[87],"not":[88],"necessary.":[89],"Instead":[90],"allocating":[92],"resources":[93],"sophisticated":[96],"front-end,":[98],"we":[99],"find":[100],"linear":[103],"paired":[106],"larger":[109],"Conformer":[110],"results":[112],"latency,":[115],"efficient":[117],"memory":[118],"usage,":[119],"and":[120],"improved":[121],"WER":[122,131],"performance.":[123],"We":[124],"achieve":[125],"new":[127],"state-of-the-art":[128],"12.8%":[130],"TED":[138],"LRS3":[139],"dataset,":[140],"which":[141],"rivals":[142],"performance":[144],"audio-only":[146],"from":[148],"just":[149],"four":[150],"years":[151],"ago.":[152]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
