{"id":"https://openalex.org/W4375869050","doi":"https://doi.org/10.1109/icassp49357.2023.10095622","title":"Multi-Speaker Data Augmentation for Improved end-to-end Automatic Speech Recognition","display_name":"Multi-Speaker Data Augmentation for Improved end-to-end Automatic Speech Recognition","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4375869050","doi":"https://doi.org/10.1109/icassp49357.2023.10095622"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095622","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp49357.2023.10095622","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"conference-paper","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101787514","display_name":"Samuel Thomas","orcid":"https://orcid.org/0000-0001-7573-0620"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samuel Thomas","raw_affiliation_strings":["IBM Research AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110252428","display_name":"Hong-Kwang Jeff Kuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong-Kwang J. Kuo","raw_affiliation_strings":["IBM Research AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079994647","display_name":"George Saon","orcid":"https://orcid.org/0009-0004-6837-5009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"George Saon","raw_affiliation_strings":["IBM Research AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research AI","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003725957","display_name":"Brian Kingsbury","orcid":"https://orcid.org/0000-0002-1343-6837"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brian Kingsbury","raw_affiliation_strings":["IBM Research AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research AI","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.83040452003479},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.8035638332366943},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.7177228331565857},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.6828393936157227},{"id":"https://openalex.org/keywords/test-data","display_name":"Test data","score":0.47644785046577454},{"id":"https://openalex.org/keywords/dialog-box","display_name":"Dialog box","score":0.43433359265327454},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37357115745544434}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.83040452003479},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.8035638332366943},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.7177228331565857},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.6828393936157227},{"id":"https://openalex.org/C16910744","wikidata":"https://www.wikidata.org/wiki/Q7705759","display_name":"Test data","level":2,"score":0.47644785046577454},{"id":"https://openalex.org/C173853756","wikidata":"https://www.wikidata.org/wiki/Q86915","display_name":"Dialog box","level":2,"score":0.43433359265327454},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37357115745544434},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095622","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp49357.2023.10095622","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6100000143051147,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1828163288","https://openalex.org/W2095705004","https://openalex.org/W2407080277","https://openalex.org/W2936774411","https://openalex.org/W2937780860","https://openalex.org/W2962760690","https://openalex.org/W2963414781","https://openalex.org/W2973127116","https://openalex.org/W3007227084","https://openalex.org/W3008037978","https://openalex.org/W3008181812","https://openalex.org/W3015686596","https://openalex.org/W3152221657","https://openalex.org/W3163300396","https://openalex.org/W3196869722","https://openalex.org/W3197661863","https://openalex.org/W4221151563","https://openalex.org/W4225308107","https://openalex.org/W4286908472","https://openalex.org/W4372260590","https://openalex.org/W6638749077","https://openalex.org/W6674330103","https://openalex.org/W6840509714"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W4297807400","https://openalex.org/W1491159402","https://openalex.org/W2144208207","https://openalex.org/W4389984014","https://openalex.org/W1509309911","https://openalex.org/W1940231550","https://openalex.org/W1599425004","https://openalex.org/W2118860825","https://openalex.org/W2096510939"],"abstract_inverted_index":{"Publicly":[0],"available":[1],"datasets":[2],"traditionally":[3],"used":[4,187],"to":[5,48,105,108,133,139,171,177],"train":[6],"E2E":[7,29,103],"ASR":[8,30],"models":[9,31],"for":[10],"conversational":[11],"telephone":[12],"speech":[13],"recognition":[14,36,173],"are":[15,46],"based":[16],"on":[17,25,35,50,129,154],"clean,":[18],"short":[19],"duration,":[20],"single":[21,83,95,130,142,156],"speaker":[22,84,89,96,109,131,143,157,160],"utterances":[23,115,132,186],"collected":[24],"separate":[26],"channels.":[27],"While":[28],"achieve":[32],"state-of-the-art":[33],"performance":[34,174],"tasks":[37],"that":[38,53,81],"match":[39],"well":[40],"with":[41],"such":[42],"training":[43,67,85],"data,":[44],"they":[45],"observed":[47],"fail":[49],"test":[51,161],"recordings":[52],"contain":[54],"multiple":[55,88,94,159],"speakers,":[56],"significant":[57],"channel":[58],"or":[59,62],"background":[60],"noise":[61],"span":[63],"longer":[64,114],"durations":[65],"than":[66],"data":[68,78,86,90],"utterances.":[69,97],"To":[70],"mitigate":[71],"these":[72],"issues,":[73],"we":[74,149],"propose":[75],"an":[76],"on-the-fly":[77],"augmentation":[79],"strategy":[80],"transforms":[82],"into":[87],"by":[91,124,175,179],"appending":[92],"together":[93],"The":[98],"proposed":[99,147],"technique":[100,148,168],"encourages":[101],"the":[102,119,146],"model":[104,120,127],"become":[106],"robust":[107],"changes":[110],"and":[111,158],"also":[112,122,164],"process":[113],"effectively.":[116],"During":[117],"training,":[118],"is":[121,169],"guided":[123],"a":[125],"teacher":[126],"trained":[128],"map":[134],"its":[135],"multi-speaker":[136],"encoder":[137],"embeddings":[138],"better":[140],"performing":[141],"representations.":[144],"With":[145],"obtain":[150],"7-14%":[151],"relative":[152],"improvement":[153],"various":[155],"sets.":[162],"We":[163],"show":[165],"how":[166],"this":[167],"able":[170],"improve":[172],"up":[176],"14%":[178],"capturing":[180],"useful":[181],"information":[182],"from":[183],"preceding":[184],"spoken":[185],"as":[188],"dialog":[189],"history.":[190]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2026-07-14T23:27:15.235271","created_date":"2025-10-10T00:00:00"}