{"id":"https://openalex.org/W4387935024","doi":"https://doi.org/10.1109/lsp.2023.3327585","title":"Key Frame Mechanism for Efficient Conformer Based End-to-End Speech Recognition","display_name":"Key Frame Mechanism for Efficient Conformer Based End-to-End Speech Recognition","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4387935024","doi":"https://doi.org/10.1109/lsp.2023.3327585"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2023.3327585","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/lsp.2023.3327585","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101880046","display_name":"Peng Fan","orcid":"https://orcid.org/0000-0002-0801-1893"},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Peng Fan","raw_affiliation_strings":["National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu, China","National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China"],"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu, China","institution_ids":["https://openalex.org/I24185976"]},{"raw_affiliation_string":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","institution_ids":["https://openalex.org/I24185976"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042775692","display_name":"Changhao Shan","orcid":null},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Changhao Shan","raw_affiliation_strings":["Du Xiaoman Financial, Beijing, China","National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","Du Xiaoman Financial, Beijing 100089, China"],"affiliations":[{"raw_affiliation_string":"Du Xiaoman Financial, Beijing, China","institution_ids":[]},{"raw_affiliation_string":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","institution_ids":["https://openalex.org/I24185976"]},{"raw_affiliation_string":"Du Xiaoman Financial, Beijing 100089, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102880200","display_name":"Sining Sun","orcid":"https://orcid.org/0000-0002-2642-5096"},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sining Sun","raw_affiliation_strings":["Du Xiaoman Financial, Beijing, China","National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","Du Xiaoman Financial, Beijing 100089, China"],"affiliations":[{"raw_affiliation_string":"Du Xiaoman Financial, Beijing, China","institution_ids":[]},{"raw_affiliation_string":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","institution_ids":["https://openalex.org/I24185976"]},{"raw_affiliation_string":"Du Xiaoman Financial, Beijing 100089, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101044235","display_name":"Qing Hua Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qing Yang","raw_affiliation_strings":["Du Xiaoman Financial, Beijing, China","National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","Du Xiaoman Financial, Beijing 100089, China"],"affiliations":[{"raw_affiliation_string":"Du Xiaoman Financial, Beijing, China","institution_ids":[]},{"raw_affiliation_string":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","institution_ids":["https://openalex.org/I24185976"]},{"raw_affiliation_string":"Du Xiaoman Financial, Beijing 100089, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100326967","display_name":"Jianwei Zhang","orcid":"https://orcid.org/0000-0002-5491-1745"},"institutions":[{"id":"https://openalex.org/I24185976","display_name":"Sichuan University","ror":"https://ror.org/011ashp19","country_code":"CN","type":"education","lineage":["https://openalex.org/I24185976"]},{"id":"https://openalex.org/I4210125143","display_name":"Chengdu University","ror":"https://ror.org/034z67559","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210125143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwei Zhang","raw_affiliation_strings":["College of Computer Science, Sichuan University, Chengdu, China","National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","College of Computer Science, Sichuan University, Chengdu 610065, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science, Sichuan University, Chengdu, China","institution_ids":["https://openalex.org/I4210125143","https://openalex.org/I24185976"]},{"raw_affiliation_string":"National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu 610065, China","institution_ids":["https://openalex.org/I24185976"]},{"raw_affiliation_string":"College of Computer Science, Sichuan University, Chengdu 610065, China","institution_ids":["https://openalex.org/I24185976"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101880046"],"corresponding_institution_ids":["https://openalex.org/I24185976"],"apc_list":null,"apc_paid":null,"fwci":0.3475,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.66733804,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":"30","issue":null,"first_page":"1612","last_page":"1616"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.8398812413215637},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6428542733192444},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6171068549156189},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5326995849609375},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.43045806884765625},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32705408334732056},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.20384064316749573},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.09278565645217896}],"concepts":[{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.8398812413215637},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6428542733192444},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6171068549156189},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5326995849609375},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.43045806884765625},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32705408334732056},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.20384064316749573},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.09278565645217896}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2023.3327585","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/lsp.2023.3327585","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.4699999988079071,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2144499799","https://openalex.org/W2327501763","https://openalex.org/W2507132449","https://openalex.org/W2892009249","https://openalex.org/W2936774411","https://openalex.org/W2962760690","https://openalex.org/W2963242190","https://openalex.org/W2963827914","https://openalex.org/W3097777922","https://openalex.org/W3160405885","https://openalex.org/W3162249256","https://openalex.org/W3196783077","https://openalex.org/W3197813307","https://openalex.org/W3198643121","https://openalex.org/W4221167707","https://openalex.org/W4281779489","https://openalex.org/W4362456643","https://openalex.org/W4372267461","https://openalex.org/W4372341576","https://openalex.org/W4375869197","https://openalex.org/W4385823095","https://openalex.org/W6675365184","https://openalex.org/W6739901393","https://openalex.org/W6838276489"],"related_works":["https://openalex.org/W3179968364","https://openalex.org/W1999612375","https://openalex.org/W2938107654","https://openalex.org/W2151749779","https://openalex.org/W3008587939","https://openalex.org/W3196421258","https://openalex.org/W4387301579","https://openalex.org/W2951281592","https://openalex.org/W2140798747","https://openalex.org/W855007925"],"abstract_inverted_index":{"Recently,":[0],"Conformer":[1,15,87,209],"as":[2,50,79,96,190,215],"a":[3,18,27,108],"backbone":[4],"network":[5,30],"for":[6,157],"end-to-end":[7],"automatic":[8],"speech":[9],"recognition":[10],"achieved":[11],"state-of-the-art":[12],"performance.":[13,38],"The":[14,122],"block":[16],"leverages":[17],"self-attention":[19,48,105,117],"mechanism":[20,118,167],"to":[21,31,111,142,149,168,180,192],"capture":[22,32],"global":[23],"information,":[24,34],"along":[25],"with":[26,46,55,93],"convolutional":[28],"neural":[29],"local":[33],"resulting":[35],"in":[36,185],"improved":[37],"However,":[39],"the":[40,47,56,59,82,86,91,102,113,116,132,144,151,162,177,193,198,237],"Conformer-based":[41],"model":[42,230],"encounters":[43],"an":[44,137],"issue":[45],"mechanism,":[49,107],"computational":[51],"complexity":[52],"grows":[53],"quadratically":[54],"length":[57],"of":[58,85,115,124],"input":[60,191],"sequence.":[61],"Inspired":[62],"by":[63],"previous":[64],"Connectionist":[65],"Temporal":[66],"Classification":[67],"(CTC)":[68],"guided":[69],"blank":[70,155,181],"skipping":[71],"during":[72,229],"decoding,":[73],"we":[74,100,135,160],"introduce":[75,101,136,161],"intermediate":[76,138],"CTC":[77,139],"outputs":[78],"guidance":[80],"into":[81],"downsampling":[83,165],"procedure":[84],"encoder.":[88,195],"We":[89],"define":[90],"frame":[92],"non-blank":[94],"output":[95],"key":[97,103,120,152,163],"frame.":[98],"Specifically,":[99],"frame-based":[104,164],"(KFSA)":[106],"novel":[109],"method":[110,221],"reduce":[112],"computation":[114],"using":[119,197],"frames.":[121],"structure":[123],"our":[125,219],"proposed":[126,199,220],"approach":[127],"comprises":[128],"two":[129],"encoders.":[130],"Following":[131],"initial":[133],"encoder,":[134],"loss":[140],"function":[141],"compute":[143],"label":[145],"frame,":[146],"enabling":[147],"us":[148],"extract":[150],"frames":[153,156,178,228],"and":[154,175,210,232],"KFSA.":[158],"Furthermore,":[159],"(KFDS)":[166],"operate":[169],"on":[170],"high-dimensional":[171],"acoustic":[172,187],"features":[173],"directly":[174],"drop":[176],"corresponding":[179],"labels,":[182],"which":[183,201,234],"results":[184],"new":[186],"feature":[188],"sequences":[189],"second":[194],"By":[196],"method,":[200],"achieves":[202],"comparable":[203],"or":[204],"higher":[205],"performance":[206],"than":[207,225],"vanilla":[208],"other":[211],"similar":[212],"work":[213],"such":[214],"Efficient":[216],"Conformer.":[217],"Meantime,":[218],"can":[222],"discard":[223],"more":[224],"60%":[226],"useless":[227],"training":[231],"inference,":[233],"will":[235],"accelerate":[236],"inference":[238],"speed":[239],"significantly.":[240]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-31T07:56:22.981413","created_date":"2025-10-10T00:00:00"}
