{"id":"https://openalex.org/W4392931320","doi":"https://doi.org/10.1109/icassp48485.2024.10446991","title":"A Study on the Adverse Impact of Synthetic Speech on Speech Recognition","display_name":"A Study on the Adverse Impact of Synthetic Speech on Speech Recognition","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392931320","doi":"https://doi.org/10.1109/icassp48485.2024.10446991"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446991","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446991","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102631901","display_name":"Jian Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jian Huang","raw_affiliation_strings":["Alibaba Group,China","Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002946082","display_name":"Yancheng Bai","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yancheng Bai","raw_affiliation_strings":["Alibaba Group,China","Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100575956","display_name":"Yang Cai","orcid":"https://orcid.org/0000-0002-4973-1598"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Cai","raw_affiliation_strings":["Alibaba Group,China","Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5000803152","display_name":"Wei Bian","orcid":"https://orcid.org/0000-0003-4252-047X"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Bian","raw_affiliation_strings":["Alibaba Group,China","Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5102631901"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.3637,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.61774459,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"33","issue":null,"first_page":"10266","last_page":"10270"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8112778663635254},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7956903576850891},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6248483061790466},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.5392289161682129},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.5359915494918823},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.47529447078704834},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.4659266173839569},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.4490915536880493},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.437451034784317},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.43321558833122253},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.41176721453666687},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.40247535705566406},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3709145188331604},{"id":"https://openalex.org/keywords/noise-reduction","display_name":"Noise reduction","score":0.14244449138641357}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8112778663635254},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7956903576850891},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6248483061790466},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.5392289161682129},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.5359915494918823},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.47529447078704834},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.4659266173839569},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.4490915536880493},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.437451034784317},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.43321558833122253},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.41176721453666687},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.40247535705566406},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3709145188331604},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.14244449138641357},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C68115822","wikidata":"https://www.wikidata.org/wiki/Q1068172","display_name":"Sound pressure","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446991","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446991","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2963242190","https://openalex.org/W3006752097","https://openalex.org/W3013139777","https://openalex.org/W3016008406","https://openalex.org/W3080248383","https://openalex.org/W3092028330","https://openalex.org/W3134568285","https://openalex.org/W4225873749","https://openalex.org/W4226278401","https://openalex.org/W4281712850","https://openalex.org/W4283067311","https://openalex.org/W4292779060","https://openalex.org/W4297841605","https://openalex.org/W4298633873","https://openalex.org/W4313679638","https://openalex.org/W4372259861","https://openalex.org/W4378501656","https://openalex.org/W4381827575","https://openalex.org/W4385573729","https://openalex.org/W4385822407","https://openalex.org/W6778883912","https://openalex.org/W6783867762","https://openalex.org/W6810738896","https://openalex.org/W6847363464","https://openalex.org/W6848735303","https://openalex.org/W6853611000","https://openalex.org/W6853998256"],"related_works":["https://openalex.org/W2131711534","https://openalex.org/W2341426843","https://openalex.org/W3151376046","https://openalex.org/W3089379469","https://openalex.org/W1583620810","https://openalex.org/W4387712795","https://openalex.org/W642007152","https://openalex.org/W1911859126","https://openalex.org/W2009814707","https://openalex.org/W2903652364"],"abstract_inverted_index":{"The":[0,143],"high-quality":[1],"synthetic":[2,21,64,99,159,176,184,193],"speech":[3,22,31,48,65,67,91,129,160,185],"by":[4,39],"TTS":[5],"has":[6],"been":[7],"widely":[8],"used":[9],"in":[10,173,190],"the":[11,35,40,60,74,79,102,135,138,147,163,167,170,181,187],"field":[12],"of":[13,34,63,81,109,141,150,154,183],"human-computer":[14],"interaction,":[15],"bringing":[16],"users":[17],"better":[18,133],"experience.":[19],"However,":[20],"is":[23,152],"prone":[24],"to":[25,44,58,94,118,132],"be":[26],"mixed":[27],"with":[28],"real":[29,97,157],"human":[30],"as":[32],"part":[33],"noise":[36],"and":[37,85,98,158],"recorded":[38],"microphone,":[41],"which":[42],"leads":[43],"performance":[45,172,189],"decrease":[46],"for":[47],"recognition.":[49],"To":[50],"address":[51],"this":[52],"issue,":[53],"we":[54,77,105],"propose":[55,106],"different":[56],"methods":[57,108],"study":[59],"adverse":[61],"impact":[62],"on":[66],"recognition,":[68],"thereby":[69],"enhancing":[70],"its":[71],"robustness.":[72],"On":[73,101],"one":[75],"hand,":[76,104],"adopt":[78],"concept":[80],"fake":[82],"audio":[83],"detection":[84],"incorporate":[86],"an":[87],"additional":[88],"module":[89],"into":[90],"recognition":[92,130],"model":[93,131],"differentiate":[95],"between":[96,137,156],"speech.":[100,142],"other":[103],"various":[107],"incorporating":[110],"prompt":[111,122,164],"labels":[112,123,165],"from":[113],"a":[114,174,191],"language":[115],"semantics":[116],"perspective":[117],"achieve":[119],"differentiation.":[120],"These":[121],"provide":[124],"contextual":[125],"cues":[126],"that":[127],"help":[128],"understand":[134],"difference":[136],"two":[139],"types":[140],"experimental":[144],"results":[145],"demonstrate":[146],"acoustic":[148],"modeling":[149],"ASR":[151],"capable":[153],"distinguishing":[155],"effectively.":[161],"Putting":[162],"at":[166],"beginning":[168],"achieves":[169],"best":[171,188],"clean":[175],"data":[177,194],"scenario,":[178],"while":[179],"emptying":[180],"transcripts":[182],"obtains":[186],"noisy":[192],"scenario.":[195]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-25T23:11:45.687758","created_date":"2025-10-10T00:00:00"}
