{"id":"https://openalex.org/W4415433438","doi":"https://doi.org/10.21437/interspeech.2025-1041","title":"MATER: Multi-level Acoustic and Textual Emotion Representation for Interpretable Speech Emotion Recognition","display_name":"MATER: Multi-level Acoustic and Textual Emotion Representation for Interpretable Speech Emotion Recognition","publication_year":2025,"publication_date":"2025-08-17","ids":{"openalex":"https://openalex.org/W4415433438","doi":"https://doi.org/10.21437/interspeech.2025-1041"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2025-1041","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2025-1041","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.19887","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002474451","display_name":"Hyo Jin Jon","orcid":"https://orcid.org/0009-0001-1580-5851"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hyo Jin Jon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059597557","display_name":"Longbin Jin","orcid":"https://orcid.org/0009-0009-8664-0124"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Longbin Jin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103445425","display_name":"Hyuntaek Jung","orcid":"https://orcid.org/0009-0002-9495-4069"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hyuntaek Jung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052001877","display_name":"Hyunseo Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hyunseo Kim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Donghun Min","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Donghun Min","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100649195","display_name":"Eun Yi Kim","orcid":"https://orcid.org/0000-0002-6944-5863"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Eun Yi Kim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5002474451"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.0776,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.9054377,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"4673","last_page":"4677"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9035999774932861,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9035999774932861,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.6486999988555908},{"id":"https://openalex.org/keywords/categorical-variable","display_name":"Categorical variable","score":0.5863999724388123},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5444999933242798},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.48410001397132874},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4832000136375427},{"id":"https://openalex.org/keywords/valence","display_name":"Valence (chemistry)","score":0.4341000020503998},{"id":"https://openalex.org/keywords/emotional-valence","display_name":"Emotional valence","score":0.38769999146461487}],"concepts":[{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.6486999988555908},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6358000040054321},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.5863999724388123},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5722000002861023},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5444999933242798},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5297999978065491},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.48410001397132874},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4832000136375427},{"id":"https://openalex.org/C168900304","wikidata":"https://www.wikidata.org/wiki/Q171407","display_name":"Valence (chemistry)","level":2,"score":0.4341000020503998},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4221000075340271},{"id":"https://openalex.org/C3020774634","wikidata":"https://www.wikidata.org/wiki/Q3113318","display_name":"Emotional valence","level":3,"score":0.38769999146461487},{"id":"https://openalex.org/C206310091","wikidata":"https://www.wikidata.org/wiki/Q750859","display_name":"Emotion classification","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C2988148770","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion detection","level":3,"score":0.30889999866485596},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.30720001459121704},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.30559998750686646},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2628999948501587}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.21437/interspeech.2025-1041","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2025-1041","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2025","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2506.19887","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.19887","pdf_url":"https://arxiv.org/pdf/2506.19887","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.19887","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.19887","pdf_url":"https://arxiv.org/pdf/2506.19887","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"our":[3],"contributions":[4],"to":[5,89],"the":[6,27,55],"Speech":[7],"Emotion":[8,41],"Recognition":[9],"in":[10,95,102,119],"Naturalistic":[11],"Conditions":[12],"(SERNC)":[13],"Challenge,":[14],"where":[15],"we":[16,37,83],"address":[17],"categorical":[18],"emotion":[19],"recognition":[20],"and":[21,34,51,58,65,79,110],"emotional":[22,97],"attribute":[23],"prediction.":[24],"To":[25],"handle":[26],"complexities":[28],"of":[29,108,114,126],"natural":[30],"speech,":[31],"including":[32],"intra-":[33],"inter-subject":[35],"variability,":[36],"propose":[38],"Multi-level":[39],"Acoustic-Textual":[40],"Representation":[42],"(MATER),":[43],"a":[44,106],"novel":[45],"hierarchical":[46],"framework":[47],"that":[48],"integrates":[49],"acoustic":[50,66],"textual":[52],"features":[53],"at":[54],"word,":[56],"utterance,":[57],"embedding":[59],"levels.":[60],"By":[61],"fusing":[62],"low-level":[63],"lexical":[64],"cues":[67],"with":[68,105,122],"high-level":[69],"contextualized":[70],"representations,":[71],"MATER":[72,99],"effectively":[73],"captures":[74],"both":[75,103],"fine-grained":[76],"prosodic":[77],"variations":[78],"semantic":[80],"nuances.":[81],"Additionally,":[82],"introduce":[84],"an":[85,111,123],"uncertainty-aware":[86],"ensemble":[87],"strategy":[88],"mitigate":[90],"annotator":[91],"inconsistencies,":[92],"improving":[93],"robustness":[94],"ambiguous":[96],"expressions.":[98],"ranks":[100],"fourth":[101],"tasks":[104],"Macro-F1":[107],"41.01%":[109],"average":[112],"CCC":[113,125],"0.5928,":[115],"securing":[116],"second":[117],"place":[118],"valence":[120],"prediction":[121],"impressive":[124],"0.6941.":[127]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2025-10-23T00:00:00"}
