{"id":"https://openalex.org/W4226324628","doi":"https://doi.org/10.1109/asru51503.2021.9688019","title":"Semi-Supervised Transfer Learning for Language Expansion of End-to-End Speech Recognition Models to Low-Resource Languages","display_name":"Semi-Supervised Transfer Learning for Language Expansion of End-to-End Speech Recognition Models to Low-Resource Languages","publication_year":2021,"publication_date":"2021-12-13","ids":{"openalex":"https://openalex.org/W4226324628","doi":"https://doi.org/10.1109/asru51503.2021.9688019"},"language":"en","primary_location":{"id":"doi:10.1109/asru51503.2021.9688019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9688019","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100396580","display_name":"Jiyeon Kim","orcid":"https://orcid.org/0000-0001-7964-5060"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Jiyeon Kim","raw_affiliation_strings":["AI Center, Samsung Research,Speech Processing Lab,South Korea"],"affiliations":[{"raw_affiliation_string":"AI Center, Samsung Research,Speech Processing Lab,South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050871485","display_name":"Mehul Kumar","orcid":"https://orcid.org/0000-0001-5556-5904"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Mehul Kumar","raw_affiliation_strings":["AI Center, Samsung Research,Speech Processing Lab,South Korea"],"affiliations":[{"raw_affiliation_string":"AI Center, Samsung Research,Speech Processing Lab,South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113125574","display_name":"Dhananjaya Gowda","orcid":null},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Dhananjaya Gowda","raw_affiliation_strings":["AI Center, Samsung Research,Speech Processing Lab,South Korea"],"affiliations":[{"raw_affiliation_string":"AI Center, Samsung Research,Speech Processing Lab,South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043920881","display_name":"Abhinav Garg","orcid":"https://orcid.org/0000-0001-5082-5500"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Abhinav Garg","raw_affiliation_strings":["AI Center, Samsung Research,Speech Processing Lab,South Korea"],"affiliations":[{"raw_affiliation_string":"AI Center, Samsung Research,Speech Processing Lab,South Korea","institution_ids":["https://openalex.org/I2250650973"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100684422","display_name":"Chanwoo Kim","orcid":"https://orcid.org/0000-0003-0193-8167"},"institutions":[{"id":"https://openalex.org/I2250650973","display_name":"Samsung (South Korea)","ror":"https://ror.org/04w3jy968","country_code":"KR","type":"company","lineage":["https://openalex.org/I2250650973"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Chanwoo Kim","raw_affiliation_strings":["AI Center, Samsung Research,Speech Processing Lab,South Korea"],"affiliations":[{"raw_affiliation_string":"AI Center, Samsung Research,Speech Processing Lab,South Korea","institution_ids":["https://openalex.org/I2250650973"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100396580"],"corresponding_institution_ids":["https://openalex.org/I2250650973"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.25093458,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"984","last_page":"988"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9763000011444092,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9746999740600586,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7970657348632812},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7669894099235535},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.7167524695396423},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6266542077064514},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5986955761909485},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5560247898101807},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.49360764026641846},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.4904346168041229},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.44702601432800293},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43023547530174255},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.41084423661231995},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.23984068632125854}],"concepts":[{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7970657348632812},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7669894099235535},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.7167524695396423},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6266542077064514},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5986955761909485},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5560247898101807},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.49360764026641846},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.4904346168041229},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.44702601432800293},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43023547530174255},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.41084423661231995},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.23984068632125854},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru51503.2021.9688019","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru51503.2021.9688019","pdf_url":null,"source":{"id":"https://openalex.org/S4363606113","display_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8100000023841858,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W587565084","https://openalex.org/W2091746061","https://openalex.org/W2105806712","https://openalex.org/W2107508685","https://openalex.org/W2526425061","https://openalex.org/W2912937645","https://openalex.org/W2913818371","https://openalex.org/W2963303951","https://openalex.org/W2963362078","https://openalex.org/W2963891433","https://openalex.org/W2972584841","https://openalex.org/W2972816482","https://openalex.org/W2972889948","https://openalex.org/W2973049979","https://openalex.org/W3008587939","https://openalex.org/W3015927303","https://openalex.org/W3026041220","https://openalex.org/W3094647783","https://openalex.org/W3095193086","https://openalex.org/W3095867028","https://openalex.org/W3096758108","https://openalex.org/W3097649098","https://openalex.org/W3168770049","https://openalex.org/W6601906477","https://openalex.org/W6743965040","https://openalex.org/W6746029170","https://openalex.org/W6780218876"],"related_works":["https://openalex.org/W5949257","https://openalex.org/W1226999","https://openalex.org/W9638664","https://openalex.org/W6397387","https://openalex.org/W10064791","https://openalex.org/W17817017","https://openalex.org/W8493306","https://openalex.org/W2195354","https://openalex.org/W13021510","https://openalex.org/W6374234"],"abstract_inverted_index":{"In":[0,72,111,153],"this":[1,137],"paper,":[2],"we":[3,51,76,156],"propose":[4,21],"a":[5,47,53,81,94,178,188,195],"three-stage":[6],"training":[7],"methodology":[8],"to":[9,122,202],"improve":[10,43],"the":[11,44,73,90,109,127,133,146,158,184,191,203],"speech":[12,174],"recognition":[13,175],"accuracy":[14,45],"of":[15,25,46,198],"low-resource":[16,48],"languages.":[17],"We":[18,114,129],"explore":[19,131],"and":[20,38,60,69,187],"an":[22],"effective":[23],"combination":[24],"techniques":[26],"such":[27],"as":[28],"transfer":[29,65,78],"learning,":[30,66],"encoder":[31,135],"freezing,":[32],"data":[33,118,140],"augmentation":[34,141],"using":[35,64,165],"Text-To-Speech":[36],"(TTS),":[37],"Semi-Supervised":[39],"Learning":[40],"(SSL).":[41],"To":[42],"Italian":[49],"ASR,":[50],"leverage":[52],"well-trained":[54,82],"English":[55,83],"model,":[56],"unlabeled":[57,61,116,168],"text":[58,117],"corpus,":[59],"audio":[62,169],"corpus":[63],"TTS":[67,120,139],"augmentation,":[68],"SSL":[70,166],"respectively.":[71],"first":[74,185],"stage,":[75],"use":[77],"learning":[79,89],"from":[80,93,167],"model.":[84,128],"This":[85,97],"primarily":[86],"helps":[87,142],"in":[88,183,190],"acoustic":[91,134],"information":[92,125],"resource-rich":[95],"language.":[96],"stage":[98,112,154],"achieves":[99,194],"around":[100],"24%":[101],"relative":[102,163,201],"Word":[103],"Error":[104],"Rate":[105],"(WER)":[106],"reduction":[107,197],"over":[108],"baseline.":[110,204],"two,":[113],"utilize":[115],"via":[119],"data-augmentation":[121],"incorporate":[123],"language":[124],"into":[126],"also":[130],"freezing":[132],"at":[136],"stage.":[138],"us":[143],"further":[144],"reduce":[145,157],"WER":[147,159,196],"by":[148,160,164],"~":[149,199],"21%":[150],"relatively.":[151],"Finally,":[152],"three":[155],"another":[161],"4%":[162],"data.":[170],"Overall,":[171],"our":[172],"two-pass":[173],"system":[176],"with":[177],"Monotonic":[179],"Chunkwise":[180],"Attention":[181],"(MoChA)":[182],"pass":[186,193],"full-attention":[189],"second":[192],"42%":[200]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2022-05-05T00:00:00"}
