{"id":"https://openalex.org/W4416799058","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249181","title":"End-to-End Integration of Speech Emotion Recognition and Voice Activity Detection with a Self-Supervised Model for Noise Robustness","display_name":"End-to-End Integration of Speech Emotion Recognition and Voice Activity Detection with a Self-Supervised Model for Noise Robustness","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416799058","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249181"},"language":null,"primary_location":{"id":"doi:10.1109/apsipaasc65261.2025.11249181","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249181","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018847693","display_name":"Natsuo Yamashita","orcid":null},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Natsuo Yamashita","raw_affiliation_strings":["Hitachi, Ltd.,Research &#x0026; Development Group,Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd.,Research &#x0026; Development Group,Japan","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047858866","display_name":"Masaaki Yamamoto","orcid":"https://orcid.org/0000-0002-8309-8669"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masaaki Yamamoto","raw_affiliation_strings":["Hitachi, Ltd.,Research &#x0026; Development Group,Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd.,Research &#x0026; Development Group,Japan","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032619330","display_name":"Yohei Kawaguchi","orcid":"https://orcid.org/0000-0002-1027-5038"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yohei Kawaguchi","raw_affiliation_strings":["Hitachi, Ltd.,Research &#x0026; Development Group,Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd.,Research &#x0026; Development Group,Japan","institution_ids":["https://openalex.org/I65143321"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5018847693"],"corresponding_institution_ids":["https://openalex.org/I65143321"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41232974,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"537","last_page":"542"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.5738999843597412,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.5738999843597412,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.16769999265670776,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.15360000729560852,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.7437000274658203},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.656499981880188},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4602999985218048},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.447299987077713},{"id":"https://openalex.org/keywords/background-noise","display_name":"Background noise","score":0.3880999982357025},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.37369999289512634},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.3628000020980835},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3601999878883362}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7843999862670898},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7437000274658203},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.7437000274658203},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.656499981880188},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4602999985218048},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.447299987077713},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4052000045776367},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.3880999982357025},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.37369999289512634},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.3628000020980835},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3601999878883362},{"id":"https://openalex.org/C2779679103","wikidata":"https://www.wikidata.org/wiki/Q5251805","display_name":"Degradation (telecommunications)","level":2,"score":0.3447999954223633},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.31949999928474426},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.3165999948978424},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.31040000915527344},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/apsipaasc65261.2025.11249181","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249181","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1972570464","https://openalex.org/W2102697000","https://openalex.org/W2108819501","https://openalex.org/W2111926505","https://openalex.org/W2129120544","https://openalex.org/W2137639365","https://openalex.org/W2146334809","https://openalex.org/W2578895956","https://openalex.org/W2922711788","https://openalex.org/W2948972667","https://openalex.org/W2959546144","https://openalex.org/W2978238286","https://openalex.org/W2995181338","https://openalex.org/W3096918678","https://openalex.org/W3119308075","https://openalex.org/W3122349645","https://openalex.org/W3160747466","https://openalex.org/W3161282967","https://openalex.org/W3193786050","https://openalex.org/W3197580070","https://openalex.org/W3197642003","https://openalex.org/W3198694222","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4221162872","https://openalex.org/W4224293426","https://openalex.org/W4226380987","https://openalex.org/W4375869379","https://openalex.org/W4385245566","https://openalex.org/W4385823101","https://openalex.org/W4392903780","https://openalex.org/W4402112145"],"related_works":[],"abstract_inverted_index":{"Speech":[0],"Emotion":[1],"Recognition":[2],"(SER)":[3],"often":[4],"operates":[5],"on":[6,108],"speech":[7,27,91],"segments":[8,28,92],"detected":[9],"by":[10],"a":[11,18,78],"Voice":[12],"Activity":[13],"Detection":[14],"(VAD)":[15],"model":[16,99],"in":[17,35,39,42],"cascade":[19],"manner.":[20],"However,":[21],"VAD":[22,58,70,86,104,123],"models":[23,73],"may":[24],"output":[25],"flawed":[26,103],"including":[29],"noise-only":[30],"or":[31],"nonemotional":[32],"segments,":[33],"especially":[34],"noisy":[36],"environments,":[37],"resulting":[38],"performance":[40,119],"degradation":[41],"subsequent":[43],"SER":[44,60,72,98,118],"models.":[45],"To":[46],"address":[47],"this":[48],"issue,":[49],"we":[50,126],"propose":[51],"an":[52],"end-to-end":[53],"(E2E)":[54],"method":[55,116,129],"that":[56,113],"integrates":[57],"and":[59,71,138],"using":[61],"Self-Supervised":[62],"Learning":[63],"(SSL)":[64],"features.":[65],"By":[66],"jointly":[67],"training":[68],"both":[69],"with":[74],"SSL":[75],"features":[76],"for":[77,93],"combined":[79],"loss":[80,136],"function,":[81],"our":[82,114,128],"approach":[83],"enables":[84],"the":[85,97,102,109,122],"module":[87],"to":[88],"capture":[89],"emotional":[90],"SER,":[94],"while":[95],"making":[96],"robust":[100],"against":[101],"output.":[105],"Experimental":[106],"results":[107],"IEMOCAP":[110],"dataset":[111],"demonstrate":[112],"proposed":[115],"improves":[117],"without":[120],"optimizing":[121],"threshold.":[124],"Furthermore,":[125],"analyze":[127],"under":[130],"various":[131],"conditions,":[132],"such":[133],"as":[134],"different":[135],"weights":[137],"noise":[139],"levels.":[140]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-28T00:00:00"}
