{"id":"https://openalex.org/W7146986385","doi":"https://doi.org/10.1016/j.csl.2026.101987","title":"Robust speech emotion recognition under human speech noise","display_name":"Robust speech emotion recognition under human speech noise","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7146986385","doi":"https://doi.org/10.1016/j.csl.2026.101987"},"language":"en","primary_location":{"id":"doi:10.1016/j.csl.2026.101987","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.csl.2026.101987","pdf_url":null,"source":{"id":"https://openalex.org/S91252481","display_name":"Computer Speech & Language","issn_l":"0885-2308","issn":["0885-2308","1095-8363"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computer Speech &amp; Language","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1016/j.csl.2026.101987","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132573059","display_name":"Jinyi Mi","orcid":null},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Jinyi Mi","raw_affiliation_strings":["Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan"],"raw_orcid":"https://orcid.org/0009-0007-8554-0987","affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan","institution_ids":["https://openalex.org/I60134161"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101796991","display_name":"Xiaohan Shi","orcid":"https://orcid.org/0000-0002-1917-4479"},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Xiaohan Shi","raw_affiliation_strings":["Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan","institution_ids":["https://openalex.org/I60134161"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132587372","display_name":"Ding Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Ding Ma","raw_affiliation_strings":["Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan","institution_ids":["https://openalex.org/I60134161"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132610152","display_name":"Jiajun He","orcid":null},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Jiajun He","raw_affiliation_strings":["Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan","institution_ids":["https://openalex.org/I60134161"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132669676","display_name":"Takuya Fujimura","orcid":null},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takuya Fujimura","raw_affiliation_strings":["Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Graduate School of Informatics, Nagoya University, Nagoya, 464-8601, Aichi, Japan","institution_ids":["https://openalex.org/I60134161"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5132628525","display_name":"Tomoki Toda","orcid":null},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tomoki Toda","raw_affiliation_strings":["Information Technology Center, Nagoya University, Nagoya, 464-8601, Aichi, Japan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Information Technology Center, Nagoya University, Nagoya, 464-8601, Aichi, Japan","institution_ids":["https://openalex.org/I60134161"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5132573059"],"corresponding_institution_ids":["https://openalex.org/I60134161"],"apc_list":{"value":3400,"currency":"USD","value_usd":3400},"apc_paid":{"value":3400,"currency":"USD","value_usd":3400},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.72407481,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"100","issue":null,"first_page":"101987","last_page":"101987"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9302999973297119,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9302999973297119,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.025599999353289604,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.012799999676644802,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7889999747276306},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.5325999855995178},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4684999883174896},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.4422000050544739},{"id":"https://openalex.org/keywords/speech-enhancement","display_name":"Speech enhancement","score":0.4399000108242035},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.43700000643730164},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.4104999899864197},{"id":"https://openalex.org/keywords/background-noise","display_name":"Background noise","score":0.3734000027179718}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8489999771118164},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7889999747276306},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7788000106811523},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.5325999855995178},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4684999883174896},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.4422000050544739},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.4399000108242035},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.43700000643730164},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4334999918937683},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.4104999899864197},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.3734000027179718},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.35760000348091125},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.3544999957084656},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.35280001163482666},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.35109999775886536},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3140999972820282},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.31299999356269836},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2824000120162964},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2583000063896179},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.2574999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1016/j.csl.2026.101987","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.csl.2026.101987","pdf_url":null,"source":{"id":"https://openalex.org/S91252481","display_name":"Computer Speech & Language","issn_l":"0885-2308","issn":["0885-2308","1095-8363"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computer Speech &amp; Language","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1016/j.csl.2026.101987","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.csl.2026.101987","pdf_url":null,"source":{"id":"https://openalex.org/S91252481","display_name":"Computer Speech & Language","issn_l":"0885-2308","issn":["0885-2308","1095-8363"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computer Speech &amp; Language","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"},{"id":"https://openalex.org/F4320334789","display_name":"Japan Science and Technology Agency","ror":"https://ror.org/00097mb19"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1976725440","https://openalex.org/W2074788634","https://openalex.org/W2082492120","https://openalex.org/W2132654227","https://openalex.org/W2146334809","https://openalex.org/W2509597859","https://openalex.org/W2803098682","https://openalex.org/W2885005742","https://openalex.org/W2951130829","https://openalex.org/W2982568434","https://openalex.org/W3205148967","https://openalex.org/W4205742757","https://openalex.org/W4321609205","https://openalex.org/W4361994820","https://openalex.org/W4385247697","https://openalex.org/W4402387459","https://openalex.org/W4409728756","https://openalex.org/W4410351567","https://openalex.org/W4410603637"],"related_works":[],"abstract_inverted_index":{"Speech":[0],"emotion":[1],"recognition":[2],"(SER)":[3],"is":[4,95],"the":[5,33,41,69,83,89,92,111,116,131,147,203],"task":[6],"of":[7,35,43,113,130,189,194,205],"predicting":[8],"human":[9,36,172,180,209],"emotions":[10],"from":[11,82],"speech":[12,37,81,149,173,181,210],"signals.":[13],"Building":[14],"robust":[15],"SER":[16,54,144,156,177,197],"systems":[17],"under":[18,170,179,207],"noisy":[19],"conditions":[20],"remains":[21],"challenging":[22],"due":[23],"to":[24,77,97,99,104,121],"different":[25],"noise":[26,150,182],"properties.":[27],"Most":[28],"previous":[29],"studies":[30,201],"have":[31],"overlooked":[32],"impact":[34],"noise,":[38],"thereby":[39],"limiting":[40],"applicability":[42],"SER.":[44],"To":[45],"address":[46],"this":[47],"issue,":[48],"we":[49,72],"propose":[50],"a":[51,63,74,123,142,154],"novel":[52],"two-stage":[53,155],"framework":[55,157],"that":[56,126,137],"combines":[57],"target":[58,84,117,132],"speaker":[59,85],"extraction":[60],"(TSE)":[61],"with":[62,115,160,183],"speaker-aware":[64,124],"fusion":[65],"network":[66],"(SAFN).":[67],"In":[68,88],"first":[70],"stage,":[71,91],"pretrain":[73],"TSE":[75,94,114,159,165],"model":[76],"effectively":[78],"extract":[79],"high-quality":[80],"in":[86,146],"mixtures.":[87],"second":[90],"pretrained":[93],"fine-tuned":[96],"adapt":[98],"emotional":[100,190],"contexts":[101],"while":[102],"continuing":[103],"suppress":[105],"interfering":[106],"speakers.":[107],"Additionally,":[108],"SAFN":[109,163,206],"integrates":[110],"output":[112],"speaker\u2019s":[118],"enrollment":[119,195],"information":[120],"generate":[122],"representation":[125],"emphasizes":[127],"emotion-relevant":[128],"cues":[129],"speaker.":[133],"Comparative":[134],"experiments":[135],"demonstrated":[136],"our":[138],"method":[139],"significantly":[140],"outperforms":[141],"typical":[143],"baseline":[145],"multi-speaker":[148,171,208],"conditions.":[151],"\u2022":[152,162,175,186,199],"Proposes":[153],"combining":[158],"SAFN.":[161],"and":[164,168,192],"enhance":[166],"accuracy":[167],"robustness":[169,204],"noise.":[174,211],"Investigates":[176],"performance":[178],"varying":[184],"attributes.":[185],"Analyzes":[187],"impacts":[188],"attributes":[191],"lengths":[193],"on":[196],"performance.":[198],"Ablation":[200],"verify":[202]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
