{"id":"https://openalex.org/W4392982123","doi":"https://doi.org/10.1109/cvmi59935.2023.10464855","title":"Enhancing Emotion Classification Through Speech and Correlated Emotional Sounds via a Variational Auto-Encoder Model with Prosodic Regularization","display_name":"Enhancing Emotion Classification Through Speech and Correlated Emotional Sounds via a Variational Auto-Encoder Model with Prosodic Regularization","publication_year":2023,"publication_date":"2023-12-10","ids":{"openalex":"https://openalex.org/W4392982123","doi":"https://doi.org/10.1109/cvmi59935.2023.10464855"},"language":"en","primary_location":{"id":"doi:10.1109/cvmi59935.2023.10464855","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvmi59935.2023.10464855","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Computer Vision and Machine Intelligence (CVMI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057533012","display_name":"Andrea Veronica Porco","orcid":null},"institutions":[{"id":"https://openalex.org/I32915989","display_name":"University of the Ryukyus","ror":"https://ror.org/02z1n9q24","country_code":"JP","type":"education","lineage":["https://openalex.org/I32915989"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Andrea Veronica Porco","raw_affiliation_strings":["University of The Ryukyus,Dept. Information Engineering,Nishihara,Japan","Dept. Information Engineering, University of The Ryukyus, Nishihara, Japan"],"affiliations":[{"raw_affiliation_string":"University of The Ryukyus,Dept. Information Engineering,Nishihara,Japan","institution_ids":["https://openalex.org/I32915989"]},{"raw_affiliation_string":"Dept. Information Engineering, University of The Ryukyus, Nishihara, Japan","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069566710","display_name":"Dongshik Kang","orcid":"https://orcid.org/0009-0006-5465-0780"},"institutions":[{"id":"https://openalex.org/I32915989","display_name":"University of the Ryukyus","ror":"https://ror.org/02z1n9q24","country_code":"JP","type":"education","lineage":["https://openalex.org/I32915989"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Dongshik Kang","raw_affiliation_strings":["University of The Ryukyus,Dept. Information Engineering,Nishihara,Japan","Dept. Information Engineering, University of The Ryukyus, Nishihara, Japan"],"affiliations":[{"raw_affiliation_string":"University of The Ryukyus,Dept. Information Engineering,Nishihara,Japan","institution_ids":["https://openalex.org/I32915989"]},{"raw_affiliation_string":"Dept. Information Engineering, University of The Ryukyus, Nishihara, Japan","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5057533012"],"corresponding_institution_ids":["https://openalex.org/I32915989"],"apc_list":null,"apc_paid":null,"fwci":0.2616,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.63390577,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.7753000259399414,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.7753000259399414,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.7210372686386108},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6741296648979187},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.6223486065864563},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5790215134620667},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5769714713096619},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41715577244758606},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.1782645583152771}],"concepts":[{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.7210372686386108},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6741296648979187},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.6223486065864563},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5790215134620667},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5769714713096619},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41715577244758606},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.1782645583152771},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvmi59935.2023.10464855","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvmi59935.2023.10464855","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Computer Vision and Machine Intelligence (CVMI)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.4699999988079071,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W2162295204","https://openalex.org/W2515020857","https://openalex.org/W2772766867","https://openalex.org/W2780113219","https://openalex.org/W2803193013","https://openalex.org/W2883496341","https://openalex.org/W2888792135","https://openalex.org/W2901552243","https://openalex.org/W2952665519","https://openalex.org/W2962691331","https://openalex.org/W2963087613","https://openalex.org/W2963364041","https://openalex.org/W2963702064","https://openalex.org/W2964135678","https://openalex.org/W2969889150","https://openalex.org/W2972667718","https://openalex.org/W2972700704","https://openalex.org/W2982999850","https://openalex.org/W3015419784","https://openalex.org/W3097969370","https://openalex.org/W3112061954","https://openalex.org/W3135547455","https://openalex.org/W3142644187","https://openalex.org/W3211326698","https://openalex.org/W4221147462","https://openalex.org/W4285235436","https://openalex.org/W4309564859","https://openalex.org/W4366493008","https://openalex.org/W6640963894","https://openalex.org/W6785164032","https://openalex.org/W6810360501"],"related_works":["https://openalex.org/W3013693939","https://openalex.org/W2159052453","https://openalex.org/W2566616303","https://openalex.org/W3131327266","https://openalex.org/W2734887215","https://openalex.org/W4297051394","https://openalex.org/W2752972570","https://openalex.org/W2145836866","https://openalex.org/W2803255133","https://openalex.org/W4220775285"],"abstract_inverted_index":{"Recent":[0],"studies":[1],"have":[2],"explored":[3],"the":[4,25,38,102,173,177,181,184,189,210,216,235,241],"development":[5],"of":[6,10,28,40,66,104,183,232],"empathetic":[7],"systems,":[8],"capable":[9],"engaging":[11],"in":[12,17,31,213],"human-like":[13],"communication":[14],"and":[15,53,58,86,132,165,176,188,215,240],"support":[16],"daily":[18],"life":[19],"tasks.":[20],"This":[21],"envision":[22],"primarily":[23],"requires":[24],"correct":[26],"classification":[27,88,103,227],"human":[29,84],"emotions":[30,114,175],"speech.":[32],"Nevertheless,":[33],"it":[34],"is":[35,122],"challenged":[36],"by":[37],"complexities":[39],"classifying":[41],"emotionally":[42,96,203],"similar":[43],"or":[44,56],"overlapped":[45],"audio":[46,117],"content":[47],"characteristics,":[48],"such":[49,146,160],"as":[50,147,161],"between":[51],"\u201cNeutral\u201d":[52],"\u201cSad\u201d":[54],"emotions,":[55,60,159],"\u201cAngry\u201d":[57],"\u201cHappy\u201d":[59,164],"representing":[61],"more":[62,202],"than":[63],"17.42":[64],"percent":[65],"misclassifications.":[67],"Furthermore,":[68],"existing":[69],"emotional":[70,78,105,110,144,211,225],"databases,":[71],"while":[72],"publicly":[73],"available,":[74],"often":[75],"contain":[76],"weakly":[77],"data,":[79],"posing":[80],"challenges":[81],"for":[82,101,113,222],"both":[83],"listeners":[85],"artificial":[87],"systems.":[89],"In":[90],"this":[91],"paper,":[92],"we":[93,191],"proposed":[94,121],"an":[95],"regularised":[97],"variational":[98,185],"auto-encoder":[99,186],"model":[100,155,187,244],"speech":[106,214,217,226],"audios":[107],"with":[108,115,134,172,196,228,238,245],"correlated":[109],"sounds,":[111,145],"particularly":[112],"overlapping":[116],"attributes.":[118],"The":[119,139,167,206],"dataset":[120],"a":[123,128,201,223,229],"mixed":[124],"dataset,":[125,131],"created":[126],"from":[127],"public":[129],"well-known":[130],"extended":[133],"real":[135,224],"added":[136],"customised":[137,140],"data.":[138],"data":[141,195],"are":[142,170,219],"distinctive":[143],"shouting,":[148],"crying,":[149],"giggling,":[150],"etc.":[151],"Our":[152],"emotion":[153],"classifier":[154],"can":[156],"classify":[157],"four":[158],"\u201cAngry\u201d,":[162],"\u201cSad\u201d,":[163],"\u201cNormal\u201d.":[166],"sounds":[168,212],"selected":[169,174],"associated":[171],"speaker":[178],"gender.":[179],"Throughout":[180],"training":[182],"classifier,":[190],"aligned":[192],"our":[193],"complex":[194],"prosodic":[197],"features,":[198],"to":[199],"obtain":[200],"expected":[204],"quality.":[205],"results":[207],"show":[208],"that":[209],"utterances,":[218],"strongly":[220],"complementary":[221],"test":[230],"accuracy":[231],"0.973,":[233],"improving":[234],"vanilla":[236],"VAE":[237],"0.881,":[239],"standard":[242],"CNN-based":[243],"0.652.":[246]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
