{"id":"https://openalex.org/W4406612562","doi":"https://doi.org/10.1109/smc54092.2024.10831974","title":"Multi-Granularity Temporal-Spectral Representation Learning for Speech Emotion Recognition","display_name":"Multi-Granularity Temporal-Spectral Representation Learning for Speech Emotion Recognition","publication_year":2024,"publication_date":"2024-10-06","ids":{"openalex":"https://openalex.org/W4406612562","doi":"https://doi.org/10.1109/smc54092.2024.10831974"},"language":"en","primary_location":{"id":"doi:10.1109/smc54092.2024.10831974","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc54092.2024.10831974","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113145350","display_name":"Zhichen Yuan","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhichen Yuan","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100643265","display_name":"C. L. Philip Chen","orcid":"https://orcid.org/0000-0001-5451-7230"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"C.L. Philip Chen","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027023224","display_name":"Shuzhen Li","orcid":"https://orcid.org/0000-0001-6847-6740"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuzhen Li","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100378754","display_name":"Tong Zhang","orcid":"https://orcid.org/0000-0001-6212-4891"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tong Zhang","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology,Guangdong Provincial Key Laboratory of AI Large Model and Intelligent Cognition,Guangzhou,China,510006","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.26731676,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"469","last_page":"474"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9670000076293945,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9670000076293945,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9434000253677368,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9362999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.8184940814971924},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7633517980575562},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6332454681396484},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5788525342941284},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.49875712394714355},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.48929259181022644},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.461064875125885},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.39164063334465027}],"concepts":[{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.8184940814971924},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7633517980575562},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6332454681396484},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5788525342941284},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49875712394714355},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.48929259181022644},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.461064875125885},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39164063334465027},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/smc54092.2024.10831974","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc54092.2024.10831974","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4000000059604645,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2004804300","https://openalex.org/W2026356512","https://openalex.org/W2074788634","https://openalex.org/W2143549750","https://openalex.org/W2408520939","https://openalex.org/W2738226240","https://openalex.org/W2784665486","https://openalex.org/W2803098682","https://openalex.org/W2889507358","https://openalex.org/W3047624172","https://openalex.org/W3081192838","https://openalex.org/W3095435288","https://openalex.org/W3209389523","https://openalex.org/W4253289577","https://openalex.org/W4292980177","https://openalex.org/W4295957212","https://openalex.org/W4296523924","https://openalex.org/W4317380951","https://openalex.org/W4323569712","https://openalex.org/W4328007250","https://openalex.org/W4361994820","https://openalex.org/W4372269797","https://openalex.org/W4376526190","https://openalex.org/W4387846988","https://openalex.org/W4388705844","https://openalex.org/W4392908941"],"related_works":["https://openalex.org/W2931688134","https://openalex.org/W2377919138","https://openalex.org/W2378857091","https://openalex.org/W103652678","https://openalex.org/W4226090359","https://openalex.org/W2059697060","https://openalex.org/W936373746","https://openalex.org/W2975817033","https://openalex.org/W3126677997","https://openalex.org/W1610857240"],"abstract_inverted_index":{"Speech":[0],"emotion":[1,66,172],"recognition":[2,67],"(SER)":[3],"captures":[4],"emotional":[5,13,30],"information":[6,31,44],"from":[7,32,81],"speech":[8,65,171],"signals":[9],"to":[10,83,95,123,125],"recognize":[11],"users'":[12],"states,":[14],"which":[15],"plays":[16],"a":[17,57,113],"crucial":[18],"role":[19],"in":[20,75,162,170],"conversational":[21],"human-computer":[22],"interaction.":[23],"Most":[24],"SER":[25],"researches":[26],"focus":[27],"on":[28,149],"ex-ploiting":[29],"global":[33],"temporal":[34,73,107,139],"or":[35],"spectral":[36],"features,":[37],"but":[38],"it":[39,124],"may":[40],"neglect":[41],"detailed":[42,86],"emotion-related":[43,97],"such":[45],"as":[46],"phonemes":[47],"and":[48,78,100,118,141,153,160],"syllables.":[49],"To":[50],"address":[51],"this":[52,54],"problem,":[53],"paper":[55],"proposes":[56],"multi-granularity":[58,138],"temporal-spectral":[59,121,134],"representation":[60,135],"learning":[61,116,136],"(MG-TSRL)":[62],"network":[63],"for":[64],"tasks.":[68],"Specifically,":[69],"MG-TSRL":[70,110,130,169],"extracts":[71],"different":[72],"features":[74,104,122,140],"phonetic,":[76],"syllabic,":[77],"sentential":[79],"granular-ity":[80],"spectrograms":[82],"retain":[84],"more":[85,127],"emotional-related":[87],"information.":[88],"It":[89],"then":[90],"designs":[91],"multilayer":[92,142],"emotion-aware":[93],"units":[94],"capture":[96],"frequency":[98,143],"patterns":[99],"obtain":[101,126],"deep":[102,120],"spectrum":[103],"at":[105],"each":[106],"granularity":[108],"feature.":[109],"further":[111],"introduces":[112],"fast":[114],"broad":[115],"system":[117],"feeds":[119],"accurate":[128],"emotions.":[129],"gradually":[131],"achieves":[132],"effective":[133],"through":[137],"pattern":[144],"learning.":[145],"The":[146],"state-of-the-art":[147],"results":[148],"the":[150,166],"CASIA,":[151],"RAVDESS,":[152],"SAVEE":[154],"datasets":[155],"are":[156],"respectively":[157],"95.17%,":[158],"92.78%,":[159],"87.50%":[161],"unweighted":[163],"accuracy,":[164],"demonstrating":[165],"effectiveness":[167],"of":[168],"recognition.":[173]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
