{"id":"https://openalex.org/W4385807435","doi":"https://doi.org/10.21437/interspeech.2023-1272","title":"Random Utterance Concatenation Based Data Augmentation for Improving Short-video Speech Recognition","display_name":"Random Utterance Concatenation Based Data Augmentation for Improving Short-video Speech Recognition","publication_year":2023,"publication_date":"2023-08-14","ids":{"openalex":"https://openalex.org/W4385807435","doi":"https://doi.org/10.21437/interspeech.2023-1272"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2023-1272","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/interspeech.2023-1272","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047521320","display_name":"Yist Y. Lin","orcid":"https://orcid.org/0009-0000-9054-1596"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yist Y. Lin","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100696345","display_name":"Tao Han","orcid":"https://orcid.org/0000-0002-5543-0716"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao Han","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039696635","display_name":"Haihua Xu","orcid":"https://orcid.org/0000-0002-2220-8465"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haihua Xu","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046222772","display_name":"Van Tung Pham","orcid":"https://orcid.org/0000-0002-2103-1901"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Van Tung Pham","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047310084","display_name":"Yerbolat Khassanov","orcid":"https://orcid.org/0000-0001-9422-6833"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yerbolat Khassanov","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001136890","display_name":"Tze Yuang Chong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tze Yuang Chong","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100702364","display_name":"Yi He","orcid":"https://orcid.org/0000-0002-5357-6623"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi He","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100366031","display_name":"Lu Lu","orcid":"https://orcid.org/0000-0002-6077-0977"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu Lu","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101108160","display_name":"Zejun Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zejun Ma","raw_affiliation_strings":["ByteDance Research"],"affiliations":[{"raw_affiliation_string":"ByteDance Research","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5047521320"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.2033,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.43368592,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"904","last_page":"908"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9668999910354614,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9668999910354614,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.953000009059906,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/concatenation","display_name":"Concatenation (mathematics)","score":0.8426553010940552},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8014386892318726},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.7971758842468262},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7388019561767578},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4111034870147705},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3854164779186249}],"concepts":[{"id":"https://openalex.org/C87619178","wikidata":"https://www.wikidata.org/wiki/Q126002","display_name":"Concatenation (mathematics)","level":2,"score":0.8426553010940552},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8014386892318726},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.7971758842468262},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7388019561767578},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4111034870147705},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3854164779186249},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2023-1272","is_oa":false,"landing_page_url":"http://dx.doi.org/10.21437/interspeech.2023-1272","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.46000000834465027,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W648786980","https://openalex.org/W854541894","https://openalex.org/W1828163288","https://openalex.org/W2062227835","https://openalex.org/W2095705004","https://openalex.org/W2143612262","https://openalex.org/W2292443655","https://openalex.org/W2327501763","https://openalex.org/W2407080277","https://openalex.org/W2525778437","https://openalex.org/W2696967604","https://openalex.org/W2746314669","https://openalex.org/W2747135936","https://openalex.org/W2936774411","https://openalex.org/W2962784628","https://openalex.org/W2962824709","https://openalex.org/W2962826786","https://openalex.org/W2963250244","https://openalex.org/W2965116050","https://openalex.org/W2985287635","https://openalex.org/W3008181812","https://openalex.org/W3008480565","https://openalex.org/W3008898571","https://openalex.org/W3015889230","https://openalex.org/W3096815019","https://openalex.org/W3097777922","https://openalex.org/W3149629662","https://openalex.org/W3162244132","https://openalex.org/W3201283182","https://openalex.org/W4286908472","https://openalex.org/W4296069150","https://openalex.org/W4297841426","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W1596769518","https://openalex.org/W2894336231","https://openalex.org/W2886693075","https://openalex.org/W2588431733","https://openalex.org/W3198503472","https://openalex.org/W4307784074","https://openalex.org/W4385807435","https://openalex.org/W2095344869","https://openalex.org/W4306291601","https://openalex.org/W3206503703"],"abstract_inverted_index":{"One":[0],"of":[1],"limitations":[2],"in":[3],"end-to-end":[4],"automatic":[5],"speech":[6,65],"recognition":[7,104],"(ASR)":[8],"framework":[9],"is":[10,80],"its":[11],"performance":[12,106],"would":[13],"be":[14,59],"compromised":[15],"if":[16],"train-test":[17,38],"utterance":[18,29,39,73,103,128],"lengths":[19],"are":[20,48],"mismatched.In":[21],"this":[22],"paper,":[23],"we":[24,47],"propose":[25],"an":[26],"on-the-fly":[27],"random":[28],"concatenation":[30],"(RUC)":[31],"based":[32],"data":[33],"augmentation":[34],"method":[35,99],"to":[36,58,91,126],"alleviate":[37],"length":[40],"mismatch":[41,88],"issue":[42],"for":[43,62,120],"short-video":[44,63],"ASR":[45],"task.Specifically,":[46],"motivated":[49],"by":[50],"observations":[51],"that":[52],"our":[53,71],"human-transcribed":[54],"training":[55],"utterances":[56],"tend":[57],"much":[60,81],"shorter":[61],"spontaneous":[64],"(\u223c3":[66],"seconds":[67,84],"on":[68,85,108,118],"average),":[69],"while":[70],"test":[72],"generated":[74],"from":[75],"voice":[76],"activity":[77],"detection":[78],"front-end":[79],"longer":[82],"(\u223c10":[83],"average).Such":[86],"a":[87],"can":[89],"lead":[90],"suboptimal":[92],"performance.Empirically,":[93],"it's":[94],"observed":[95],"the":[96],"proposed":[97],"RUC":[98],"significantly":[100],"improves":[101],"long":[102],"without":[105],"drop":[107],"short":[109],"one.Overall,":[110],"it":[111],"achieves":[112],"5.72%":[113],"word":[114],"error":[115],"rate":[116],"reduction":[117],"average":[119],"15":[121],"languages":[122],"and":[123],"improved":[124],"robustness":[125],"various":[127],"length.":[129]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
