{"id":"https://openalex.org/W3207815550","doi":"https://doi.org/10.1109/taslp.2021.3120632","title":"Keyword Search Using Attention-Based End-to-End ASR and Frame-Synchronous Phoneme Alignments","display_name":"Keyword Search Using Attention-Based End-to-End ASR and Frame-Synchronous Phoneme Alignments","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3207815550","doi":"https://doi.org/10.1109/taslp.2021.3120632","mag":"3207815550"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2021.3120632","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3120632","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035774762","display_name":"Runyan Yang","orcid":"https://orcid.org/0000-0003-3466-6882"},"institutions":[{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Runyan Yang","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072887461","display_name":"Gaofeng Cheng","orcid":"https://orcid.org/0000-0002-2102-6061"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gaofeng Cheng","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027112640","display_name":"Haoran Miao","orcid":"https://orcid.org/0000-0001-8017-0429"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoran Miao","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052454197","display_name":"Li Ta","orcid":"https://orcid.org/0000-0001-5431-9787"},"institutions":[{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ta Li","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036446253","display_name":"Pengyuan Zhang","orcid":"https://orcid.org/0000-0001-6838-5160"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengyuan Zhang","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100425112","display_name":"Yonghong Yan","orcid":"https://orcid.org/0000-0001-6907-5770"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210099069","display_name":"Institute of Acoustics","ror":"https://ror.org/00v8rqv75","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210099069"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yonghong Yan","raw_affiliation_strings":["Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#"],"affiliations":[{"raw_affiliation_string":"Key Laboratory of Speech Acoustics and Content Understanding, Institute of Acoustics, Chinese Academy of Sciences, Beijing, China#TAB#","institution_ids":["https://openalex.org/I4210099069","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5035774762"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210099069"],"apc_list":null,"apc_paid":null,"fwci":2.0395,"has_fulltext":false,"cited_by_count":20,"citation_normalized_percentile":{"value":0.8917054,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":93,"max":98},"biblio":{"volume":"29","issue":null,"first_page":"3202","last_page":"3215"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8812411427497864},{"id":"https://openalex.org/keywords/connectionism","display_name":"Connectionism","score":0.5612362027168274},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5261535048484802},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.49384409189224243},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4778591990470886},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.45594581961631775},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.45589306950569153},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4263092279434204},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.4156866669654846},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.3538801074028015}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8812411427497864},{"id":"https://openalex.org/C8521452","wikidata":"https://www.wikidata.org/wiki/Q203790","display_name":"Connectionism","level":3,"score":0.5612362027168274},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5261535048484802},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.49384409189224243},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4778591990470886},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.45594581961631775},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.45589306950569153},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4263092279434204},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.4156866669654846},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3538801074028015},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2021.3120632","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2021.3120632","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6399999856948853}],"awards":[{"id":"https://openalex.org/G387038611","display_name":null,"funder_award_id":"2020AAA0108002","funder_id":"https://openalex.org/F4320336026","funder_display_name":"National Key Research and Development Program of China Stem Cell and Translational Research"}],"funders":[{"id":"https://openalex.org/F4320336026","display_name":"National Key Research and Development Program of China Stem Cell and Translational Research","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":87,"referenced_works":["https://openalex.org/W6908809","https://openalex.org/W46679369","https://openalex.org/W49437105","https://openalex.org/W854541894","https://openalex.org/W1522301498","https://openalex.org/W1524333225","https://openalex.org/W1526236009","https://openalex.org/W1631260214","https://openalex.org/W1824659232","https://openalex.org/W1993882792","https://openalex.org/W2019029064","https://openalex.org/W2064675550","https://openalex.org/W2080401855","https://openalex.org/W2089537772","https://openalex.org/W2095705004","https://openalex.org/W2097685957","https://openalex.org/W2102113734","https://openalex.org/W2122797512","https://openalex.org/W2127141656","https://openalex.org/W2133564696","https://openalex.org/W2143612262","https://openalex.org/W2158698691","https://openalex.org/W2160815625","https://openalex.org/W2166637769","https://openalex.org/W2167338739","https://openalex.org/W2183341477","https://openalex.org/W2212465773","https://openalex.org/W2293634267","https://openalex.org/W2293858598","https://openalex.org/W2327501763","https://openalex.org/W2407080277","https://openalex.org/W2442329935","https://openalex.org/W2526425061","https://openalex.org/W2531327146","https://openalex.org/W2578392894","https://openalex.org/W2612800309","https://openalex.org/W2697044473","https://openalex.org/W2747135936","https://openalex.org/W2750499125","https://openalex.org/W2754134153","https://openalex.org/W2766219058","https://openalex.org/W2944180060","https://openalex.org/W2961149267","https://openalex.org/W2962760690","https://openalex.org/W2962778134","https://openalex.org/W2962780374","https://openalex.org/W2962824709","https://openalex.org/W2962826786","https://openalex.org/W2963403868","https://openalex.org/W2963414149","https://openalex.org/W2963414781","https://openalex.org/W2963827914","https://openalex.org/W2963970535","https://openalex.org/W2964121744","https://openalex.org/W2964199361","https://openalex.org/W2964308564","https://openalex.org/W2964323211","https://openalex.org/W2966163367","https://openalex.org/W2972389417","https://openalex.org/W3002595344","https://openalex.org/W3007227084","https://openalex.org/W3095321517","https://openalex.org/W3097747488","https://openalex.org/W3163237592","https://openalex.org/W3198850376","https://openalex.org/W4385245566","https://openalex.org/W6600284362","https://openalex.org/W6601894380","https://openalex.org/W6602023532","https://openalex.org/W6623517193","https://openalex.org/W6631190155","https://openalex.org/W6631362777","https://openalex.org/W6636811518","https://openalex.org/W6638691251","https://openalex.org/W6673287344","https://openalex.org/W6674330103","https://openalex.org/W6675365184","https://openalex.org/W6679434410","https://openalex.org/W6688089860","https://openalex.org/W6696934422","https://openalex.org/W6713762819","https://openalex.org/W6718561954","https://openalex.org/W6728622933","https://openalex.org/W6739901393","https://openalex.org/W6744105228","https://openalex.org/W6765779497","https://openalex.org/W6790328491"],"related_works":["https://openalex.org/W17155033","https://openalex.org/W3207760230","https://openalex.org/W1496222301","https://openalex.org/W4312814274","https://openalex.org/W1590307681","https://openalex.org/W2536018345","https://openalex.org/W4285370786","https://openalex.org/W2296488620","https://openalex.org/W2358353312","https://openalex.org/W2916997151"],"abstract_inverted_index":{"Attention-based":[0],"end-to-end":[1],"(E2E)":[2],"automatic":[3],"speech":[4],"recognition":[5,15],"(ASR)":[6],"architectures":[7],"are":[8],"now":[9],"the":[10,38,74,87,95,102,108,135,151,186,193,214],"state-of-the-art":[11],"in":[12,27,158,165],"terms":[13],"of":[14,91,160,188],"performance.":[16],"However,":[17],"despite":[18],"their":[19],"effectiveness,":[20],"they":[21],"have":[22],"not":[23],"been":[24],"widely":[25],"applied":[26],"keyword":[28,54],"search":[29],"(KWS)":[30],"tasks":[31],"yet.":[32],"In":[33],"this":[34,125,191],"paper,":[35],"we":[36,58,100,149],"propose":[37],"Att-E2E-KWS":[39,137,196,210],"architecture,":[40],"an":[41],"attention-based":[42,69],"E2E":[43,70,82,117,216],"ASR":[44,83,217],"framework":[45,62],"for":[46],"KWS":[47,66,219],"that":[48,207],"can":[49,133],"afford":[50],"accurate":[51,141],"and":[52,78,85,106,120,144,172,182,198,203],"reliable":[53,145],"retrieval":[55],"results.":[56],"First,":[57],"design":[59],"a":[60,153],"basic":[61],"to":[63,93,113,167],"carry":[64],"out":[65],"based":[67,218],"on":[68,180,201],"ASR.":[71],"We":[72,123,176],"adopt":[73],"connectionist":[75],"temporal":[76],"classification":[77],"attention":[79],"(CTC/Att)":[80],"joint":[81],"architecture":[84],"exploit":[86],"spike":[88],"posterior":[89],"property":[90],"CTC":[92,215],"provide":[94,114,134],"keywords":[96],"time":[97,129,142],"stamps.":[98],"Second,":[99],"introduce":[101],"frame-synchronous":[103],"phonemes":[104],"modeling":[105],"use":[107,150],"dynamic":[109,128],"programming":[110],"(DP)":[111],"algorithm":[112],"alignments":[115],"between":[116],"grapheme":[118],"outputs":[119],"phoneme":[121],"outputs.":[122],"call":[124],"alignment":[126,130],"procedure":[127],"(DTA),":[131],"which":[132],"proposed":[136,209],"system":[138],"with":[139],"more":[140,169],"stamps":[143],"confidence":[146],"scores.":[147],"Third,":[148],"Transformer,":[152],"self-attention-based":[154],"encoder-decoder":[155],"neural":[156,163],"network,":[157],"place":[159],"conventional":[161],"recurrent":[162],"networks":[164],"order":[166],"yield":[168],"parallelizable":[170],"models":[171],"increased":[173],"training":[174],"speed.":[175],"conduct":[177],"comprehensive":[178],"experiments":[179],"English":[181],"Mandarin":[183],"Chinese.":[184],"To":[185],"best":[187],"our":[189,208],"knowledge,":[190],"is":[192],"first":[194],"practical":[195],"framework,":[197],"experimental":[199],"results":[200],"Switchboard":[202],"HKUST":[204],"corpora":[205],"show":[206],"systems":[211],"significantly":[212],"outperform":[213],"baselines.":[220]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
