{"id":"https://openalex.org/W4403421225","doi":"https://doi.org/10.1109/taffc.2024.3481253","title":"A Residual Multi-Scale Convolutional Neural Network With Transformers for Speech Emotion Recognition","display_name":"A Residual Multi-Scale Convolutional Neural Network With Transformers for Speech Emotion Recognition","publication_year":2024,"publication_date":"2024-10-15","ids":{"openalex":"https://openalex.org/W4403421225","doi":"https://doi.org/10.1109/taffc.2024.3481253"},"language":"en","primary_location":{"id":"doi:10.1109/taffc.2024.3481253","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taffc.2024.3481253","pdf_url":null,"source":{"id":"https://openalex.org/S104780363","display_name":"IEEE Transactions on Affective Computing","issn_l":"1949-3045","issn":["1949-3045","2371-9850"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Affective Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101742217","display_name":"Tianhao Yan","orcid":"https://orcid.org/0000-0003-1851-6075"},"institutions":[{"id":"https://openalex.org/I4210123185","display_name":"Zhejiang Lab","ror":"https://ror.org/02m2h7991","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210123185"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tianhao Yan","raw_affiliation_strings":["Research Center for Data Hub and Security, Zhejiang Lab, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-1851-6075","affiliations":[{"raw_affiliation_string":"Research Center for Data Hub and Security, Zhejiang Lab, Hangzhou, China","institution_ids":["https://openalex.org/I4210123185"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074442283","display_name":"Hao Meng","orcid":"https://orcid.org/0000-0003-3586-9286"},"institutions":[{"id":"https://openalex.org/I151727225","display_name":"Harbin Engineering University","ror":"https://ror.org/03x80pn82","country_code":"CN","type":"education","lineage":["https://openalex.org/I151727225"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Meng","raw_affiliation_strings":["College of Intelligent Systems Science and Engineering, Harbin Engineering University, Harbin, China"],"raw_orcid":"https://orcid.org/0000-0003-3586-9286","affiliations":[{"raw_affiliation_string":"College of Intelligent Systems Science and Engineering, Harbin Engineering University, Harbin, China","institution_ids":["https://openalex.org/I151727225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073718976","display_name":"Emilia Parada\u2010Cabaleiro","orcid":"https://orcid.org/0000-0003-1843-3632"},"institutions":[{"id":"https://openalex.org/I121883995","display_name":"Johannes Kepler University of Linz","ror":"https://ror.org/052r2xn60","country_code":"AT","type":"education","lineage":["https://openalex.org/I121883995"]}],"countries":["AT"],"is_corresponding":false,"raw_author_name":"Emilia Parada-Cabaleiro","raw_affiliation_strings":["Institute of Computational Perception, Johannes Kepler University Linz, Linz, Austria"],"raw_orcid":"https://orcid.org/0000-0003-1843-3632","affiliations":[{"raw_affiliation_string":"Institute of Computational Perception, Johannes Kepler University Linz, Linz, Austria","institution_ids":["https://openalex.org/I121883995"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112613657","display_name":"Jianhua Tao","orcid":"https://orcid.org/0000-0002-9344-6428"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianhua Tao","raw_affiliation_strings":["Department of Automation, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-9344-6428","affiliations":[{"raw_affiliation_string":"Department of Automation, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021697903","display_name":"Taihao Li","orcid":"https://orcid.org/0000-0003-3279-7125"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taihao Li","raw_affiliation_strings":["Hangzhou Institute for Advanced Study, UCAS, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-3279-7125","affiliations":[{"raw_affiliation_string":"Hangzhou Institute for Advanced Study, UCAS, Hangzhou, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043060302","display_name":"Bj\u00f6rn W. Schuller","orcid":"https://orcid.org/0000-0002-6478-8699"},"institutions":[{"id":"https://openalex.org/I179225836","display_name":"University of Augsburg","ror":"https://ror.org/03p14d497","country_code":"DE","type":"education","lineage":["https://openalex.org/I179225836"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bj\u00f6rn W. Schuller","raw_affiliation_strings":["Embedded Intelligence for Health Care &#x0026; Wellbeing, University of Augsburg, Augsburg, Germany","Chair of Embedded Intelligence for Health Care &amp; Wellbeing, University of Augsburg, Augsburg, Germany"],"raw_orcid":"https://orcid.org/0000-0002-6478-8699","affiliations":[{"raw_affiliation_string":"Embedded Intelligence for Health Care &#x0026; Wellbeing, University of Augsburg, Augsburg, Germany","institution_ids":["https://openalex.org/I179225836"]},{"raw_affiliation_string":"Chair of Embedded Intelligence for Health Care &amp; Wellbeing, University of Augsburg, Augsburg, Germany","institution_ids":["https://openalex.org/I179225836"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101742217"],"corresponding_institution_ids":["https://openalex.org/I4210123185"],"apc_list":null,"apc_paid":null,"fwci":0.9865,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.75958225,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"16","issue":"2","first_page":"915","last_page":"932"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8532999753952026,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.8532999753952026,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8147000074386597,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.7330999970436096,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.7265481948852539},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.6634204387664795},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.6561654806137085},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6538548469543457},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6106425523757935},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4828675091266632},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4791400134563446},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4740821123123169},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4198024272918701},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3369717001914978},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.16157278418540955},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.05318853259086609},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.05078527331352234}],"concepts":[{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.7265481948852539},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.6634204387664795},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.6561654806137085},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6538548469543457},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6106425523757935},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4828675091266632},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4791400134563446},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4740821123123169},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4198024272918701},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3369717001914978},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.16157278418540955},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.05318853259086609},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.05078527331352234},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/taffc.2024.3481253","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taffc.2024.3481253","pdf_url":null,"source":{"id":"https://openalex.org/S104780363","display_name":"IEEE Transactions on Affective Computing","issn_l":"1949-3045","issn":["1949-3045","2371-9850"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Affective Computing","raw_type":"journal-article"},{"id":"pmh:oai:mediatum.ub.tum.de:node/1795819","is_oa":false,"landing_page_url":"https://mediatum.ub.tum.de/1795819","pdf_url":null,"source":{"id":"https://openalex.org/S4377196330","display_name":"mediaTUM  (Technical University of Munich)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I62916508","host_organization_name":"Technical University of Munich","host_organization_lineage":["https://openalex.org/I62916508"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":60,"referenced_works":["https://openalex.org/W2079735306","https://openalex.org/W2085662862","https://openalex.org/W2146334809","https://openalex.org/W2293040502","https://openalex.org/W2342475039","https://openalex.org/W2525412388","https://openalex.org/W2556418146","https://openalex.org/W2742542661","https://openalex.org/W2806051338","https://openalex.org/W2885005742","https://openalex.org/W2889717020","https://openalex.org/W2913340405","https://openalex.org/W2936774411","https://openalex.org/W2963447013","https://openalex.org/W2980520956","https://openalex.org/W3086923691","https://openalex.org/W3091199198","https://openalex.org/W3092788858","https://openalex.org/W3094550259","https://openalex.org/W3096609285","https://openalex.org/W3097352614","https://openalex.org/W3107389224","https://openalex.org/W3114925916","https://openalex.org/W3126625480","https://openalex.org/W3129001924","https://openalex.org/W3137890092","https://openalex.org/W3138516171","https://openalex.org/W3159124063","https://openalex.org/W3161659450","https://openalex.org/W3162811262","https://openalex.org/W3163091219","https://openalex.org/W3164046946","https://openalex.org/W3164582967","https://openalex.org/W3184672723","https://openalex.org/W3196974791","https://openalex.org/W3199294604","https://openalex.org/W3205148967","https://openalex.org/W3207455498","https://openalex.org/W3209072429","https://openalex.org/W3210971248","https://openalex.org/W3215155711","https://openalex.org/W4200502037","https://openalex.org/W4205781455","https://openalex.org/W4213374241","https://openalex.org/W4221016305","https://openalex.org/W4221089191","https://openalex.org/W4223460604","https://openalex.org/W4226442948","https://openalex.org/W4229071437","https://openalex.org/W4254718357","https://openalex.org/W4285093204","https://openalex.org/W4312209192","https://openalex.org/W4318580191","https://openalex.org/W4361994820","https://openalex.org/W4385245566","https://openalex.org/W6749029207","https://openalex.org/W6772230580","https://openalex.org/W6776231833","https://openalex.org/W6780218876","https://openalex.org/W7045509596"],"related_works":["https://openalex.org/W2560215812","https://openalex.org/W2949601986","https://openalex.org/W4293226380","https://openalex.org/W2788972299","https://openalex.org/W2498789492","https://openalex.org/W2521347458","https://openalex.org/W2729981612","https://openalex.org/W4233449973","https://openalex.org/W3126677997","https://openalex.org/W1610857240"],"abstract_inverted_index":{"The":[0,149],"great":[1],"variety":[2],"of":[3,33,92,151,154,166,174],"human":[4],"emotional":[5,65],"expression":[6],"as":[7,9,54],"well":[8,58],"the":[10,13,31,45,68,72,81,89,96,124,139,145,152,158,162,175,179],"differences":[11],"in":[12,41,71,95,127,142,178],"ways":[14],"they":[15],"perceive":[16],"and":[17,27,78,131,164],"annotate":[18],"them":[19],"make":[20],"Speech":[21],"Emotion":[22],"Recognition":[23],"(SER)":[24],"an":[25,108,172],"ambiguous":[26],"challenging":[28],"task.":[29],"With":[30],"development":[32],"deep":[34],"learning,":[35],"long-term":[36],"progress":[37],"has":[38],"been":[39],"made":[40],"SER":[42],"systems.":[43],"However,":[44],"existing":[46],"convolutional":[47],"neural":[48],"networks":[49],"present":[50],"certain":[51],"limitations,":[52,105],"such":[53],"their":[55],"inability":[56],"to":[57,102,121],"capture":[59],"global":[60],"features,":[61],"which":[62,85],"contain":[63],"important":[64],"information.":[66],"Moreover,":[67],"position":[69,90],"encoding":[70],"Transformer":[73,117],"structure":[74],"is":[75],"relatively":[76],"fixed":[77],"only":[79],"encodes":[80],"time":[82],"domain":[83,98],"dimension,":[84],"cannot":[86],"effectively":[87],"obtain":[88],"information":[91],"discriminative":[93],"features":[94,130],"frequency":[97],"dimension.":[99],"In":[100],"order":[101],"overtake":[103],"these":[104],"we":[106,137],"propose":[107],"end-to-end":[109],"Residual":[110],"Multi-Scale":[111],"Convolutional":[112],"Neural":[113],"Networks":[114],"(RMSCNN)":[115],"with":[116,144],"model":[118],"network.":[119],"Simultaneously,":[120],"further":[122],"validate":[123],"effectivenessof":[125],"RMSCNN":[126],"extracting":[128],"multi-scale":[129],"delivering":[132],"pertinent":[133],"emotion":[134],"localization":[135],"data,":[136],"developed":[138],"RMSC_down":[140],"network":[141],"conjunction":[143],"Wav2Vec":[146],"2.0":[147],"model.":[148],"results":[150],"prediction":[153],"Arousal,":[155],"Valenceand":[156],"Dominanceon":[157],"popular":[159],"corpora":[160],"demonstrate":[161],"superiority":[163],"robustness":[165],"our":[167],"approach":[168],"for":[169],"SER,":[170],"showing":[171],"improvement":[173],"recognition":[176],"accuracy":[177],"public":[180],"dataset":[181],"MSP-Podcast":[182],"1.9":[183],"version.":[184]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
