{"id":"https://openalex.org/W4408352360","doi":"https://doi.org/10.1109/icassp49660.2025.10888649","title":"SegAug: CTC-Aligned Segmented Augmentation For Robust RNN-Transducer Based Speech Recognition","display_name":"SegAug: CTC-Aligned Segmented Augmentation For Robust RNN-Transducer Based Speech Recognition","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352360","doi":"https://doi.org/10.1109/icassp49660.2025.10888649"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10888649","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888649","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112684412","display_name":"Khanh Le","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Khanh Le","raw_affiliation_strings":["ZaloAI,Vietnam"],"affiliations":[{"raw_affiliation_string":"ZaloAI,Vietnam","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083304186","display_name":"Tuan Vu Ho","orcid":"https://orcid.org/0000-0001-6819-0443"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tuan Vu Ho","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Dung Tran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dung Tran","raw_affiliation_strings":["Independent Researcher"],"affiliations":[{"raw_affiliation_string":"Independent Researcher","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051044805","display_name":"Duc Thanh Chau","orcid":null},"institutions":[{"id":"https://openalex.org/I23582244","display_name":"Ho Chi Minh City University of Science","ror":"https://ror.org/05jfbgm49","country_code":"VN","type":"education","lineage":["https://openalex.org/I123565023","https://openalex.org/I23582244"]}],"countries":["VN"],"is_corresponding":false,"raw_author_name":"Duc Thanh Chau","raw_affiliation_strings":["Ho Chi Minh City University of Science"],"affiliations":[{"raw_affiliation_string":"Ho Chi Minh City University of Science","institution_ids":["https://openalex.org/I23582244"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5112684412"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.4566,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.9351558,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.9045000076293945,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6454206705093384},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6023485064506531},{"id":"https://openalex.org/keywords/transducer","display_name":"Transducer","score":0.540367066860199},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3525722026824951},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.2153552770614624},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.050718605518341064}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6454206705093384},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6023485064506531},{"id":"https://openalex.org/C56318395","wikidata":"https://www.wikidata.org/wiki/Q215928","display_name":"Transducer","level":2,"score":0.540367066860199},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3525722026824951},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.2153552770614624},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.050718605518341064}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10888649","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10888649","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Gender equality","id":"https://metadata.un.org/sdg/5","score":0.49000000953674316}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2127141656","https://openalex.org/W2407080277","https://openalex.org/W2936774411","https://openalex.org/W2972650231","https://openalex.org/W3015686596","https://openalex.org/W3016234571","https://openalex.org/W3094833745","https://openalex.org/W3094979069","https://openalex.org/W3096615836","https://openalex.org/W3097777922","https://openalex.org/W3149629662","https://openalex.org/W3152221657","https://openalex.org/W3163839574","https://openalex.org/W3193590960","https://openalex.org/W3197478142","https://openalex.org/W3197976714","https://openalex.org/W3198439131","https://openalex.org/W3205201903","https://openalex.org/W4221167707","https://openalex.org/W4224518768","https://openalex.org/W4372340909","https://openalex.org/W4375869369","https://openalex.org/W4385245566","https://openalex.org/W6638749077"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2012283803","https://openalex.org/W4384820447","https://openalex.org/W2072454424","https://openalex.org/W2117438306","https://openalex.org/W2185942010","https://openalex.org/W2260725127","https://openalex.org/W2004297762"],"abstract_inverted_index":{"RNN-Transducer":[0],"(RNN-T)":[1],"is":[2],"a":[3,115,158],"widely":[4],"adopted":[5],"architecture":[6],"in":[7,15,30,153],"speech":[8,163],"recognition,":[9],"integrating":[10],"acoustic":[11,86],"and":[12,51,103,111,125,145,168],"language":[13,97],"modeling":[14],"an":[16,63],"end-to-end":[17],"framework.":[18],"However,":[19],"the":[20,80,90,109,133],"RNN-T":[21],"predictor":[22],"tends":[23],"to":[24,34,82,121],"over-rely":[25],"on":[26,85,108,123,127],"consecutive":[27],"word":[28],"dependencies":[29],"training":[31],"data,":[32],"leading":[33],"high":[35],"deletion":[36,101,138],"error":[37],"rates,":[38],"particularly":[39],"with":[40,73,140],"less":[41],"common":[42],"or":[43],"out-of-domain":[44],"phrases.":[45],"Existing":[46],"solutions,":[47],"such":[48],"as":[49],"regularization":[50],"data":[52],"augmentation,":[53],"often":[54],"compromise":[55],"other":[56],"aspects":[57],"of":[58,94,119,132,143],"performance.":[59,106],"We":[60],"propose":[61],"SegAug,":[62],"alignment-based":[64],"augmentation":[65],"technique":[66],"that":[67],"generates":[68],"contextually":[69],"varied":[70],"audio-text":[71],"pairs":[72],"low":[74],"sentence-level":[75],"semantics.":[76],"This":[77],"method":[78],"encourages":[79],"model":[81],"focus":[83],"more":[84],"features":[87],"while":[88],"diversifying":[89],"learned":[91],"textual":[92],"patterns":[93],"its":[95],"internal":[96],"model,":[98],"thereby":[99],"reducing":[100],"errors":[102],"enhancing":[104,162],"overall":[105],"Evaluations":[107],"LibriSpeech":[110],"Tedlium-v3":[112],"datasets":[113],"demonstrate":[114],"relative":[116,141],"WER":[117],"reduction":[118],"up":[120],"12.5%":[122],"small-scale":[124],"6.9%":[126],"large-scale":[128],"settings.":[129],"Notably,":[130],"most":[131],"improvement":[134],"stems":[135],"from":[136],"reduced":[137],"errors,":[139],"reductions":[142],"45.4%":[144],"18.5%,":[146],"respectively.":[147],"These":[148],"results":[149],"highlight":[150],"SegAug\u2019s":[151],"effectiveness":[152],"improving":[154],"RNN-T\u2019s":[155],"robustness,":[156],"offering":[157],"promising":[159],"solution":[160],"for":[161],"recognition":[164],"performance":[165],"across":[166],"diverse":[167],"challenging":[169],"scenarios.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
