{"id":"https://openalex.org/W4406461849","doi":"https://doi.org/10.1109/slt61566.2024.10832240","title":"Crossmodal ASR Error Correction With Discrete Speech Units","display_name":"Crossmodal ASR Error Correction With Discrete Speech Units","publication_year":2024,"publication_date":"2024-12-02","ids":{"openalex":"https://openalex.org/W4406461849","doi":"https://doi.org/10.1109/slt61566.2024.10832240"},"language":"en","primary_location":{"id":"doi:10.1109/slt61566.2024.10832240","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832240","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://www.research.ed.ac.uk/files/498065239/2405.16677v2.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5077749078","display_name":"Yuanchao Li","orcid":"https://orcid.org/0000-0002-8005-947X"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Yuanchao Li","raw_affiliation_strings":["University of Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023609587","display_name":"Pinzhen Chen","orcid":"https://orcid.org/0000-0003-0089-5118"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Pinzhen Chen","raw_affiliation_strings":["University of Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102911387","display_name":"Peter Bell","orcid":"https://orcid.org/0000-0002-9597-9615"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Peter Bell","raw_affiliation_strings":["University of Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5058881584","display_name":"Catherine Lai","orcid":"https://orcid.org/0000-0003-2411-8954"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Catherine Lai","raw_affiliation_strings":["University of Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5077749078"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":1.844,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.88353271,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"431","last_page":"438"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/crossmodal","display_name":"Crossmodal","score":0.7889894247055054},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7431589365005493},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6706265807151794},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.09625568985939026},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.08120092749595642},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.056353747844696045}],"concepts":[{"id":"https://openalex.org/C60115397","wikidata":"https://www.wikidata.org/wiki/Q5188732","display_name":"Crossmodal","level":4,"score":0.7889894247055054},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7431589365005493},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6706265807151794},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.09625568985939026},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.08120092749595642},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.056353747844696045},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/slt61566.2024.10832240","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt61566.2024.10832240","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.ed.ac.uk:openaire/910845f1-0196-4dec-9936-ba485240857e","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/files/498065239/2405.16677v2.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4406922455","display_name":"Edinburgh Research Explorer","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Li, Y, Chen, P, Bell, P & Lai, C 2025, Crossmodal ASR error correction with discrete speech units. in Proceedings of 2024 IEEE Spoken Language Technology Workshop (SLT). IEEE Workshop on Spoken Language Technology, Institute of Electrical and Electronics Engineers, Singapore, pp. 431-438, IEEE Spoken Language Technology Workshop 2024 , Macau, China, 2/12/24. https://doi.org/10.1109/SLT61566.2024.10832240","raw_type":"contributionToPeriodical"},{"id":"pmh:oai:pure.ed.ac.uk:publications/910845f1-0196-4dec-9936-ba485240857e","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/en/publications/910845f1-0196-4dec-9936-ba485240857e","pdf_url":null,"source":{"id":"https://openalex.org/S4406922455","display_name":"Edinburgh Research Explorer","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Li, Y, Chen, P, Bell, P & Lai, C 2025, Crossmodal ASR error correction with discrete speech units. in Proceedings of 2024 IEEE Spoken Language Technology Workshop (SLT). IEEE Workshop on Spoken Language Technology, Institute of Electrical and Electronics Engineers, Singapore, pp. 431-438, IEEE Spoken Language Technology Workshop 2024 , Macau, China, 2/12/24. https://doi.org/10.1109/SLT61566.2024.10832240","raw_type":"contributionToPeriodical"}],"best_oa_location":{"id":"pmh:oai:pure.ed.ac.uk:openaire/910845f1-0196-4dec-9936-ba485240857e","is_oa":true,"landing_page_url":"https://www.research.ed.ac.uk/files/498065239/2405.16677v2.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4406922455","display_name":"Edinburgh Research Explorer","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Li, Y, Chen, P, Bell, P & Lai, C 2025, Crossmodal ASR error correction with discrete speech units. in Proceedings of 2024 IEEE Spoken Language Technology Workshop (SLT). IEEE Workshop on Spoken Language Technology, Institute of Electrical and Electronics Engineers, Singapore, pp. 431-438, IEEE Spoken Language Technology Workshop 2024 , Macau, China, 2/12/24. https://doi.org/10.1109/SLT61566.2024.10832240","raw_type":"contributionToPeriodical"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4000000059604645}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W2059652594","https://openalex.org/W2146334809","https://openalex.org/W2525778437","https://openalex.org/W2556418146","https://openalex.org/W2616647696","https://openalex.org/W2742542661","https://openalex.org/W2747874407","https://openalex.org/W2888936194","https://openalex.org/W2916997151","https://openalex.org/W2962780374","https://openalex.org/W3016256870","https://openalex.org/W3036601975","https://openalex.org/W3097777922","https://openalex.org/W3196473441","https://openalex.org/W3197976821","https://openalex.org/W3208480086","https://openalex.org/W3209059054","https://openalex.org/W4226175622","https://openalex.org/W4292595872","https://openalex.org/W4297841805","https://openalex.org/W4319862479","https://openalex.org/W4372269226","https://openalex.org/W4375868963","https://openalex.org/W4375869060","https://openalex.org/W4375869259","https://openalex.org/W4385807453","https://openalex.org/W4385822254","https://openalex.org/W4385822632","https://openalex.org/W4386566539","https://openalex.org/W4389520395","https://openalex.org/W4391709581","https://openalex.org/W4401609030","https://openalex.org/W6686282164","https://openalex.org/W6727690538","https://openalex.org/W6771467084","https://openalex.org/W6775659032","https://openalex.org/W6780218876","https://openalex.org/W6810818841","https://openalex.org/W6847363464","https://openalex.org/W6860809563"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4240440807","https://openalex.org/W953566696","https://openalex.org/W2010220987","https://openalex.org/W2010927954","https://openalex.org/W2085535992","https://openalex.org/W4386123105","https://openalex.org/W3117345873"],"abstract_inverted_index":{"ASR":[0,15,24,69,146],"remains":[1],"unsatisfactory":[2],"in":[3,18],"scenarios":[4],"where":[5],"the":[6,42,85,96,112],"speaking":[7],"style":[8],"diverges":[9],"from":[10,104],"that":[11,142],"used":[12],"to":[13,91],"train":[14],"systems,":[16],"resulting":[17],"erroneous":[19],"transcripts.":[20],"To":[21],"address":[22],"this,":[23],"Error":[25],"Correction":[26],"(AEC),":[27],"a":[28,135],"post-ASR":[29],"processing":[30],"approach,":[31],"is":[32],"required.":[33],"In":[34],"this":[35],"work,":[36],"we":[37,83],"tackle":[38],"an":[39,68],"understudied":[40],"issue:":[41],"Low-Resource":[43],"Out-of-Domain":[44],"(LROOD)":[45],"problem,":[46],"by":[47],"investigating":[48],"crossmodal":[49],"AEC":[50,101,119],"on":[51,75,121,131,137],"very":[52],"limited":[53],"downstream":[54,151],"data":[55,123],"with":[56,93],"1-best":[57],"hypothesis":[58],"transcription.":[59],"We":[60],"explore":[61],"pretraining":[62],"and":[63,66,94,107,114,129],"fine-tuning":[64],"strategies":[65],"uncover":[67],"domain":[70],"discrepancy":[71],"phenomenon,":[72],"shedding":[73],"light":[74],"appropriate":[76],"training":[77],"schemes":[78],"for":[79,99,150],"LROOD":[80,122],"data.":[81,133],"Moreover,":[82],"propose":[84],"incorporation":[86],"of":[87,116],"discrete":[88],"speech":[89,138],"units":[90],"align":[92],"enhance":[95],"word":[97],"embeddings":[98],"improving":[100],"quality.":[102],"Results":[103],"multiple":[105],"corpora":[106],"several":[108],"evaluation":[109],"metrics":[110],"demonstrate":[111],"feasibility":[113],"efficacy":[115],"our":[117,143],"proposed":[118],"approach":[120],"as":[124,126],"well":[125],"its":[127],"generalizability":[128],"superiority":[130],"large-scale":[132],"Finally,":[134],"study":[136],"emotion":[139],"recognition":[140],"confirms":[141],"model":[144],"produces":[145],"error-robust":[147],"transcripts":[148],"suitable":[149],"applications.":[152]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2026-03-04T09:10:02.777135","created_date":"2025-10-10T00:00:00"}
