{"id":"https://openalex.org/W4389911672","doi":"https://doi.org/10.48550/arxiv.2312.09369","title":"Audio-visual fine-tuning of audio-only ASR models","display_name":"Audio-visual fine-tuning of audio-only ASR models","publication_year":2023,"publication_date":"2023-12-14","ids":{"openalex":"https://openalex.org/W4389911672","doi":"https://doi.org/10.48550/arxiv.2312.09369"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2312.09369","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.09369","pdf_url":"https://arxiv.org/pdf/2312.09369","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.09369","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071979205","display_name":"Avner May","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"May, Avner","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023556869","display_name":"Dmitriy Serdyuk","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Serdyuk, Dmitriy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101531439","display_name":"Ankit Shah","orcid":"https://orcid.org/0000-0002-8838-5421"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shah, Ankit Parag","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048852285","display_name":"Otavio Braga","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Braga, Otavio","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5005881531","display_name":"Olivier Siohan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siohan, Olivier","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5071979205"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.832573652267456},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.728243350982666},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.70728999376297},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5639971494674683},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.542343020439148},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.46462637186050415},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4536355137825012},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.41197121143341064},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.09701144695281982}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.832573652267456},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.728243350982666},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.70728999376297},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5639971494674683},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.542343020439148},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.46462637186050415},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4536355137825012},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.41197121143341064},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.09701144695281982},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2312.09369","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.09369","pdf_url":"https://arxiv.org/pdf/2312.09369","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2312.09369","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2312.09369","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.09369","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.09369","pdf_url":"https://arxiv.org/pdf/2312.09369","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"score":0.550000011920929,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4389911672.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2309273277","https://openalex.org/W1769849273","https://openalex.org/W2061937230","https://openalex.org/W1574295218","https://openalex.org/W113247760","https://openalex.org/W2547793174","https://openalex.org/W2132885390","https://openalex.org/W2070212102","https://openalex.org/W2132658536","https://openalex.org/W2544241817"],"abstract_inverted_index":{"Audio-visual":[0],"automatic":[1],"speech":[2],"recognition":[3],"(AV-ASR)":[4],"models":[5],"are":[6,46],"very":[7],"effective":[8],"at":[9],"reducing":[10],"word":[11],"error":[12],"rates":[13],"on":[14,39,88],"noisy":[15],"speech,":[16],"but":[17,43],"require":[18],"large":[19],"amounts":[20],"of":[21],"transcribed":[22,40],"AV":[23,41,73,125,138],"training":[24],"data.":[25],"Recently,":[26],"audio-visual":[27],"self-supervised":[28],"learning":[29],"(SSL)":[30],"approaches":[31],"have":[32],"been":[33],"developed":[34],"to":[35,106,116],"reduce":[36],"this":[37,53,79,114],"dependence":[38],"data,":[42],"these":[44,58],"methods":[45,61,87],"quite":[47],"complex":[48],"and":[49,65,70,101],"computationally":[50],"expensive.":[51],"In":[52],"work,":[54],"we":[55,109,111,130],"propose":[56],"replacing":[57],"expensive":[59],"AV-SSL":[60,86,133],"with":[62,83],"a":[63,118],"simple":[64],"fast":[66],"\\textit{audio-only}":[67],"SSL":[68],"method,":[69],"then":[71],"performing":[72],"supervised":[74],"fine-tuning.":[75],"We":[76],"show":[77,110],"that":[78],"approach":[80,115],"is":[81],"competitive":[82],"state-of-the-art":[84],"(SOTA)":[85],"the":[89],"LRS3-TED":[90],"benchmark":[91],"task":[92],"(within":[93],"0.5%":[94],"absolute":[95],"WER),":[96],"while":[97],"being":[98],"dramatically":[99],"simpler":[100],"more":[102],"efficient":[103],"(12-30x":[104],"faster":[105],"pre-train).":[107],"Furthermore,":[108],"can":[112],"extend":[113],"convert":[117],"SOTA":[119,132],"audio-only":[120],"ASR":[121],"model":[122],"into":[123],"an":[124],"model.":[126],"By":[127],"doing":[128],"so,":[129],"match":[131],"results,":[134],"even":[135],"though":[136],"no":[137],"data":[139],"was":[140],"used":[141],"during":[142],"pre-training.":[143]},"counts_by_year":[],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
