{"id":"https://openalex.org/W2898004252","doi":"https://doi.org/10.1109/icme.2018.8486564","title":"Auditory-Inspired End-to-End Speech Emotion Recognition Using 3D Convolutional Recurrent Neural Networks Based on Spectral-Temporal Representation","display_name":"Auditory-Inspired End-to-End Speech Emotion Recognition Using 3D Convolutional Recurrent Neural Networks Based on Spectral-Temporal Representation","publication_year":2018,"publication_date":"2018-07-01","ids":{"openalex":"https://openalex.org/W2898004252","doi":"https://doi.org/10.1109/icme.2018.8486564","mag":"2898004252"},"language":"en","primary_location":{"id":"doi:10.1109/icme.2018.8486564","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme.2018.8486564","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5009370806","display_name":"Zhichao Peng","orcid":"https://orcid.org/0000-0003-1020-7796"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhichao Peng","raw_affiliation_strings":["School of Computer Science and Technology, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tianjin University, Tianjin, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089591444","display_name":"Zhi Zhu","orcid":"https://orcid.org/0000-0002-1525-9395"},"institutions":[{"id":"https://openalex.org/I177738480","display_name":"Japan Advanced Institute of Science and Technology","ror":"https://ror.org/03frj4r98","country_code":"JP","type":"education","lineage":["https://openalex.org/I177738480"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Zhi Zhu","raw_affiliation_strings":["Graduate School of Advanced Science and Technology, Japan Advanced Institute of Science and Technology, Ishikawa, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Advanced Science and Technology, Japan Advanced Institute of Science and Technology, Ishikawa, Japan","institution_ids":["https://openalex.org/I177738480"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014199725","display_name":"Masashi Unoki","orcid":"https://orcid.org/0000-0002-6605-2052"},"institutions":[{"id":"https://openalex.org/I177738480","display_name":"Japan Advanced Institute of Science and Technology","ror":"https://ror.org/03frj4r98","country_code":"JP","type":"education","lineage":["https://openalex.org/I177738480"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masashi Unoki","raw_affiliation_strings":["Graduate School of Advanced Science and Technology, Japan Advanced Institute of Science and Technology, Ishikawa, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Advanced Science and Technology, Japan Advanced Institute of Science and Technology, Ishikawa, Japan","institution_ids":["https://openalex.org/I177738480"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["School of Computer Science and Technology, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tianjin University, Tianjin, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055059119","display_name":"Masato Akagi","orcid":"https://orcid.org/0000-0003-2450-6754"},"institutions":[{"id":"https://openalex.org/I177738480","display_name":"Japan Advanced Institute of Science and Technology","ror":"https://ror.org/03frj4r98","country_code":"JP","type":"education","lineage":["https://openalex.org/I177738480"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masato Akagi","raw_affiliation_strings":["Graduate School of Advanced Science and Technology, Japan Advanced Institute of Science and Technology, Ishikawa, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Advanced Science and Technology, Japan Advanced Institute of Science and Technology, Ishikawa, Japan","institution_ids":["https://openalex.org/I177738480"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5009370806"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.1606,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.80163892,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7275573015213013},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7050065398216248},{"id":"https://openalex.org/keywords/psychoacoustics","display_name":"Psychoacoustics","score":0.5370150804519653},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5231296420097351},{"id":"https://openalex.org/keywords/modulation","display_name":"Modulation (music)","score":0.5056827068328857},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4565746784210205},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4327922761440277},{"id":"https://openalex.org/keywords/recurrent-neural-network","display_name":"Recurrent neural network","score":0.42142254114151},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.29929137229919434},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.2558911144733429},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.0949992835521698},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.07649335265159607},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.07003960013389587}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7275573015213013},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7050065398216248},{"id":"https://openalex.org/C9940772","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Psychoacoustics","level":3,"score":0.5370150804519653},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5231296420097351},{"id":"https://openalex.org/C123079801","wikidata":"https://www.wikidata.org/wiki/Q750240","display_name":"Modulation (music)","level":2,"score":0.5056827068328857},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4565746784210205},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4327922761440277},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.42142254114151},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.29929137229919434},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2558911144733429},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.0949992835521698},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.07649335265159607},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.07003960013389587},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/icme.2018.8486564","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme.2018.8486564","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2018 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},{"id":"pmh:oai:dspace.jaist.ac.jp:10119/15481","is_oa":false,"landing_page_url":"http://hdl.handle.net/10119/15481","pdf_url":null,"source":{"id":"https://openalex.org/S4406922663","display_name":"JAIST Repository","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1480485976","https://openalex.org/W1538131130","https://openalex.org/W1966574198","https://openalex.org/W1983904091","https://openalex.org/W2042401599","https://openalex.org/W2046331056","https://openalex.org/W2054139811","https://openalex.org/W2060539877","https://openalex.org/W2061068689","https://openalex.org/W2062663442","https://openalex.org/W2064641533","https://openalex.org/W2064675550","https://openalex.org/W2076502758","https://openalex.org/W2087618018","https://openalex.org/W2146334809","https://openalex.org/W2157184970","https://openalex.org/W2158239695","https://openalex.org/W2232901134","https://openalex.org/W2295001676","https://openalex.org/W2399733683","https://openalex.org/W2514017951","https://openalex.org/W2578895956","https://openalex.org/W2583743457","https://openalex.org/W2962736520","https://openalex.org/W2963131709","https://openalex.org/W2963467407","https://openalex.org/W6632100814","https://openalex.org/W6697498398","https://openalex.org/W6725773563"],"related_works":["https://openalex.org/W2748454020","https://openalex.org/W3119610945","https://openalex.org/W3181746755","https://openalex.org/W2902723393","https://openalex.org/W3016958897","https://openalex.org/W4283379348","https://openalex.org/W2963891724","https://openalex.org/W2767651786","https://openalex.org/W2912288872","https://openalex.org/W564581980"],"abstract_inverted_index":{"The":[0,114,138,147],"human":[1,29,39],"auditory":[2,30,40],"system":[3,41,82],"has":[4,19],"far":[5],"superior":[6],"emotion":[7,14,23,80,136],"recognition":[8,15,24,81,157],"abilities":[9],"compared":[10,159],"with":[11],"recent":[12],"speech":[13,43],"systems,":[16],"so":[17],"research":[18],"focused":[20],"on":[21,95,143],"designing":[22],"systems":[25],"by":[26],"mimicking":[27],"the":[28,38,67,74,110,126,144,156,163],"system.":[31],"Psychoacoustic":[32],"and":[33,47,51,69,125],"physiological":[34],"studies":[35],"indicate":[36],"that":[37,150,161],"decomposes":[42],"signals":[44],"into":[45],"acoustic":[46],"modulation":[48,55,64,97,100],"frequency":[49],"components,":[50],"further":[52],"extracts":[53],"temporal":[54,63,70,96],"cues.":[56,98],"Speech":[57],"emotional":[58],"states":[59],"are":[60],"perceived":[61],"from":[62],"cues":[65,101],"using":[66,87],"spectral":[68],"receptive":[71],"field":[72],"of":[73,112,162],"neuron.":[75],"This":[76],"paper":[77],"proposes":[78],"an":[79,84],"in":[83],"end-to-end":[85],"manner":[86],"three-dimensional":[88],"convolutional":[89,115],"recurrent":[90,127],"neural":[91],"networks":[92],"(3D-CRNNs)":[93],"based":[94],"Temporal":[99],"contain":[102],"four-dimensional":[103],"spectral-temporal":[104],"(ST)":[105],"integration":[106],"representations":[107],"directly":[108],"as":[109],"input":[111],"3D-CRNNs.":[113],"layer":[116,128],"is":[117,129],"used":[118,130],"to":[119,131,160],"extract":[120,132],"high-level":[121],"multiscale":[122],"ST":[123],"representations,":[124],"long-term":[133],"dependency":[134],"for":[135],"recognition.":[137],"proposed":[139,152],"method":[140,153],"was":[141],"verified":[142],"IEMOCAP":[145],"database.":[146],"results":[148],"show":[149],"our":[151],"can":[154],"exceed":[155],"accuracy":[158],"state-of-the-art":[164],"systems.":[165]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":2}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
