{"id":"https://openalex.org/W3201953150","doi":"https://doi.org/10.1109/ijcnn52387.2021.9534170","title":"Two-Stage Pre-Training for Sequence to Sequence Speech Recognition","display_name":"Two-Stage Pre-Training for Sequence to Sequence Speech Recognition","publication_year":2021,"publication_date":"2021-07-18","ids":{"openalex":"https://openalex.org/W3201953150","doi":"https://doi.org/10.1109/ijcnn52387.2021.9534170","mag":"3201953150"},"language":"en","primary_location":{"id":"doi:10.1109/ijcnn52387.2021.9534170","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn52387.2021.9534170","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027251228","display_name":"Zhiyun Fan","orcid":"https://orcid.org/0000-0001-9180-7392"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiyun Fan","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, China","School of Artificial Intelligence, University of Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101400153","display_name":"Shiyu Zhou","orcid":"https://orcid.org/0000-0002-6889-0316"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiyu Zhou","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108642431","display_name":"Bo Xu","orcid":"https://orcid.org/0000-0002-1111-1529"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Xu","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5027251228"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210112150","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.136,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.55310183,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.8092957139015198},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8033984899520874},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7160658240318298},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.6675331592559814},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.6347606778144836},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5247419476509094},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4985229969024658},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.4810336232185364},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.4731903076171875},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.45326563715934753},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.451500803232193},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45064765214920044},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.4469550549983978},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39450207352638245},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3830673396587372},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.26481765508651733},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.10449638962745667},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.06639739871025085}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.8092957139015198},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8033984899520874},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7160658240318298},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.6675331592559814},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.6347606778144836},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5247419476509094},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4985229969024658},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.4810336232185364},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.4731903076171875},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.45326563715934753},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.451500803232193},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45064765214920044},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.4469550549983978},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39450207352638245},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3830673396587372},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.26481765508651733},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.10449638962745667},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.06639739871025085},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn52387.2021.9534170","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn52387.2021.9534170","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4300000071525574}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W854541894","https://openalex.org/W1526236009","https://openalex.org/W1828163288","https://openalex.org/W1915251500","https://openalex.org/W2102113734","https://openalex.org/W2127141656","https://openalex.org/W2133564696","https://openalex.org/W2157331557","https://openalex.org/W2183341477","https://openalex.org/W2577366047","https://openalex.org/W2804648901","https://openalex.org/W2889048668","https://openalex.org/W2889213362","https://openalex.org/W2896457183","https://openalex.org/W2913318911","https://openalex.org/W2940322076","https://openalex.org/W2948947170","https://openalex.org/W2962826786","https://openalex.org/W2963242190","https://openalex.org/W2963340922","https://openalex.org/W2963341956","https://openalex.org/W2963403868","https://openalex.org/W2964308564","https://openalex.org/W2972694856","https://openalex.org/W2972706021","https://openalex.org/W2979476256","https://openalex.org/W2981991061","https://openalex.org/W2996383576","https://openalex.org/W3003875258","https://openalex.org/W4385245566","https://openalex.org/W6623517193","https://openalex.org/W6638749077","https://openalex.org/W6640059789","https://openalex.org/W6675365184","https://openalex.org/W6679434410","https://openalex.org/W6686164453","https://openalex.org/W6732447497","https://openalex.org/W6739901393","https://openalex.org/W6751433836","https://openalex.org/W6754473786","https://openalex.org/W6755207826","https://openalex.org/W6769196770","https://openalex.org/W6769238691"],"related_works":["https://openalex.org/W2146197305","https://openalex.org/W2134950286","https://openalex.org/W3081187864","https://openalex.org/W151018310","https://openalex.org/W4380605396","https://openalex.org/W3133352777","https://openalex.org/W2008737763","https://openalex.org/W2584084702","https://openalex.org/W2784059283","https://openalex.org/W3012383599"],"abstract_inverted_index":{"The":[0,39,108],"attention-based":[1],"encoder-decoder":[2,29],"structure":[3],"is":[4,111],"popular":[5],"in":[6],"automatic":[7],"speech":[8,36,62,70,85],"recognition":[9],"(ASR).":[10],"However,":[11],"it":[12],"relies":[13],"heavily":[14],"on":[15,113,151,158],"transcribed":[16],"data.":[17],"In":[18,51,76],"this":[19,120],"paper,":[20],"we":[21,56,81,118,139],"propose":[22],"a":[23,58,87,93],"novel":[24],"pre-training":[25,40,47,54,79,110],"strategy":[26],"for":[27,130],"the":[28,52,65,77,100,106,114,133,136],"sequence-to-sequence":[30],"(seq2seq)":[31],"model":[32,122],"by":[33,67],"utilizing":[34],"unpaired":[35],"and":[37,48,97,117,128,153],"transcripts.":[38],"process":[41],"consists":[42],"of":[43,61,90,126,135],"two":[44],"stages,":[45],"acoustic":[46,53],"linguistic":[49,78],"pre-training.":[50],"stage,":[55,80],"use":[57,99],"large":[59,88],"amount":[60],"to":[63,104,123,149,156],"pre-train":[64,105],"encoder":[66],"predicting":[68],"masked":[69],"feature":[71],"chunks":[72],"with":[73],"their":[74],"contexts.":[75],"first":[82],"generate":[83],"synthesized":[84,101],"from":[86,147,154],"number":[89],"transcripts":[91],"using":[92],"text-to-speech":[94],"(TTS)":[95],"system":[96],"then":[98],"paired":[102],"data":[103],"decoder.":[107],"two-stage":[109],"conducted":[112],"AISHELL-2":[115],"dataset,":[116],"apply":[119],"pre-trained":[121],"multiple":[124],"subsets":[125],"AISHELL-1":[127,152],"HKUST":[129],"post-training.":[131],"As":[132],"size":[134],"subset":[137],"increases,":[138],"obtain":[140],"relative":[141],"character":[142],"error":[143],"rate":[144],"reduction":[145],"(CERR)":[146],"38.24%":[148],"7.88%":[150],"12.00%":[155],"1.20%":[157],"HKUST.":[159]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
