{"id":"https://openalex.org/W3198455051","doi":"https://doi.org/10.21437/interspeech.2021-1623","title":"Acoustic Data-Driven Subword Modeling for End-to-End Speech Recognition","display_name":"Acoustic Data-Driven Subword Modeling for End-to-End Speech Recognition","publication_year":2021,"publication_date":"2021-08-27","ids":{"openalex":"https://openalex.org/W3198455051","doi":"https://doi.org/10.21437/interspeech.2021-1623","mag":"3198455051"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2021-1623","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1623","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100636970","display_name":"Wei Zhou","orcid":"https://orcid.org/0009-0006-3754-8872"},"institutions":[{"id":"https://openalex.org/I887968799","display_name":"RWTH Aachen University","ror":"https://ror.org/04xfq0f34","country_code":"DE","type":"education","lineage":["https://openalex.org/I887968799"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Wei Zhou","raw_affiliation_strings":["AppTek GmbH, 52062 Aachen, Germany","Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany"],"affiliations":[{"raw_affiliation_string":"AppTek GmbH, 52062 Aachen, Germany","institution_ids":[]},{"raw_affiliation_string":"Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","institution_ids":["https://openalex.org/I887968799"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060987315","display_name":"Mohammad Zeineldeen","orcid":null},"institutions":[{"id":"https://openalex.org/I887968799","display_name":"RWTH Aachen University","ror":"https://ror.org/04xfq0f34","country_code":"DE","type":"education","lineage":["https://openalex.org/I887968799"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Mohammad Zeineldeen","raw_affiliation_strings":["Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","AppTek GmbH, 52062 Aachen, Germany"],"affiliations":[{"raw_affiliation_string":"Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","institution_ids":["https://openalex.org/I887968799"]},{"raw_affiliation_string":"AppTek GmbH, 52062 Aachen, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021556775","display_name":"Zuoyun Zheng","orcid":null},"institutions":[{"id":"https://openalex.org/I887968799","display_name":"RWTH Aachen University","ror":"https://ror.org/04xfq0f34","country_code":"DE","type":"education","lineage":["https://openalex.org/I887968799"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Zuoyun Zheng","raw_affiliation_strings":["Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany"],"affiliations":[{"raw_affiliation_string":"Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","institution_ids":["https://openalex.org/I887968799"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088968292","display_name":"Ralf Schl\u00fcter","orcid":"https://orcid.org/0000-0003-2839-9247"},"institutions":[{"id":"https://openalex.org/I887968799","display_name":"RWTH Aachen University","ror":"https://ror.org/04xfq0f34","country_code":"DE","type":"education","lineage":["https://openalex.org/I887968799"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Ralf Schl\u00fcter","raw_affiliation_strings":["Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","AppTek GmbH, 52062 Aachen, Germany"],"affiliations":[{"raw_affiliation_string":"Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","institution_ids":["https://openalex.org/I887968799"]},{"raw_affiliation_string":"AppTek GmbH, 52062 Aachen, Germany","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112501010","display_name":"Hermann Ney","orcid":null},"institutions":[{"id":"https://openalex.org/I887968799","display_name":"RWTH Aachen University","ror":"https://ror.org/04xfq0f34","country_code":"DE","type":"education","lineage":["https://openalex.org/I887968799"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Hermann Ney","raw_affiliation_strings":["AppTek GmbH, 52062 Aachen, Germany","Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany"],"affiliations":[{"raw_affiliation_string":"AppTek GmbH, 52062 Aachen, Germany","institution_ids":[]},{"raw_affiliation_string":"Human Language Technology and Pattern Recognition, Computer Science Department, RWTH Aachen University, 52074 Aachen, Germany","institution_ids":["https://openalex.org/I887968799"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100636970"],"corresponding_institution_ids":["https://openalex.org/I887968799"],"apc_list":null,"apc_paid":null,"fwci":1.2238,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.83422386,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"2886","last_page":"2890"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9858999848365784,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.865146279335022},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7241319417953491},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.708143949508667},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.506714403629303},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4248344302177429},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3056032359600067},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.28166237473487854}],"concepts":[{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.865146279335022},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7241319417953491},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.708143949508667},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.506714403629303},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4248344302177429},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3056032359600067},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28166237473487854}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2021-1623","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2021-1623","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2021","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5400000214576721,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W125614575","https://openalex.org/W1494198834","https://openalex.org/W1828163288","https://openalex.org/W2110798204","https://openalex.org/W2121879602","https://openalex.org/W2262393948","https://openalex.org/W2327501763","https://openalex.org/W2471933213","https://openalex.org/W2748816379","https://openalex.org/W2899879954","https://openalex.org/W2903382683","https://openalex.org/W2938348542","https://openalex.org/W2952288254","https://openalex.org/W2962784628","https://openalex.org/W2962826786","https://openalex.org/W2963953073","https://openalex.org/W2963979492","https://openalex.org/W2972630480","https://openalex.org/W3008174054","https://openalex.org/W3008191852","https://openalex.org/W3008525923","https://openalex.org/W3095229326","https://openalex.org/W3097777922","https://openalex.org/W3160551958"],"related_works":["https://openalex.org/W1542012215","https://openalex.org/W202591681","https://openalex.org/W2566781703","https://openalex.org/W1888891122","https://openalex.org/W3150666256","https://openalex.org/W2019192795","https://openalex.org/W1607833327","https://openalex.org/W2167800425","https://openalex.org/W3015412285","https://openalex.org/W1995694396"],"abstract_inverted_index":{"Subword":[0],"units":[1,55],"are":[2,67],"commonly":[3],"used":[4],"for":[5,60,121],"end-to-end":[6,71],"automatic":[7],"speech":[8],"recognition":[9],"(ASR),":[10],"while":[11],"a":[12,43],"fully":[13,44],"acoustic-oriented":[14],"subword":[15,25,38,54,96,134],"modeling":[16,26,97],"approach":[17,28],"is":[18,119],"somewhat":[19],"missing.We":[20],"propose":[21],"an":[22],"acoustic":[23],"data-driven":[24],"(ADSM)":[27],"that":[29,85,104],"adapts":[30],"the":[31,81],"advantages":[32],"of":[33],"several":[34],"text-based":[35],"and":[36,48,56,77,94,112,117,124,136],"acousticbased":[37],"methods":[39],"into":[40],"one":[41],"pipeline.With":[42],"acousticoriented":[45],"label":[46],"design":[47],"learning":[49],"process,":[50],"ADSM":[51,65,86,105],"produces":[52],"acoustic-structured":[53],"acoustic-matched":[57],"target":[58],"sequence":[59,115],"further":[61],"ASR":[62,72],"training.The":[63],"obtained":[64],"labels":[66],"evaluated":[68],"with":[69],"different":[70],"approaches":[73],"including":[74],"CTC,":[75],"RNN-Transducer":[76],"attention":[78],"models.Experiments":[79],"on":[80],"LibriSpeech":[82],"corpus":[83],"show":[84],"clearly":[87],"outperforms":[88],"both":[89,122],"byte":[90],"pair":[91],"encoding":[92],"(BPE)":[93],"pronunciationassisted":[95],"(PASM)":[98],"in":[99],"all":[100],"cases.Detailed":[101],"analysis":[102],"shows":[103],"achieves":[106],"acoustically":[107],"more":[108,113],"logical":[109],"word":[110],"segmentation":[111,139],"balanced":[114],"length,":[116],"thus,":[118],"suitable":[120],"time-synchronous":[123],"label-synchronous":[125],"models.We":[126],"also":[127],"briefly":[128],"describe":[129],"how":[130],"to":[131],"apply":[132],"acoustic-based":[133],"regularization":[135],"unseen":[137],"text":[138],"using":[140],"ADSM.":[141]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
