{"id":"https://openalex.org/W7158788335","doi":"https://doi.org/10.48550/arxiv.2604.26347","title":"The False Resonance: A Critical Examination of Emotion Embedding Similarity for Speech Generation Evaluation","display_name":"The False Resonance: A Critical Examination of Emotion Embedding Similarity for Speech Generation Evaluation","publication_year":2026,"publication_date":"2026-04-29","ids":{"openalex":"https://openalex.org/W7158788335","doi":"https://doi.org/10.48550/arxiv.2604.26347"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.26347","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26347","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.26347","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101943312","display_name":"Yi-Ting Tsai","orcid":"https://orcid.org/0000-0001-6098-8944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tsai, Yun-Shao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134918775","display_name":"Yi-Cheng Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yi-Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089273852","display_name":"Huang-Cheng Chou","orcid":"https://orcid.org/0000-0003-2125-5689"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chou, Huang-Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112788374","display_name":"T. C. Hsu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsu, Tzu-Wen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104254656","display_name":"Yun-Man Hsu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsu, Yun-Man","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134885018","display_name":"Chun Wei Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Chun Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134882774","display_name":"Shrikanth Narayanan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Narayanan, Shrikanth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134888648","display_name":"Hung-yi Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Hung-yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.6198999881744385,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.6198999881744385,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.12080000340938568,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12488","display_name":"Mental Health via Writing","score":0.03880000114440918,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/prosody","display_name":"Prosody","score":0.7721999883651733},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.682699978351593},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6747000217437744},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.42480000853538513},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.4187999963760376},{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.41449999809265137},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4018999934196472},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.3919999897480011},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.34150001406669617}],"concepts":[{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.7721999883651733},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.682699978351593},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6747000217437744},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5837000012397766},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5561000108718872},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.42480000853538513},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.4187999963760376},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41769999265670776},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.41449999809265137},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4018999934196472},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.3919999897480011},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3874000012874603},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.34150001406669617},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.33880001306533813},{"id":"https://openalex.org/C143110190","wikidata":"https://www.wikidata.org/wiki/Q5373787","display_name":"Emotional expression","level":2,"score":0.33709999918937683},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3176000118255615},{"id":"https://openalex.org/C7863114","wikidata":"https://www.wikidata.org/wiki/Q192627","display_name":"Mimicry","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C206310091","wikidata":"https://www.wikidata.org/wiki/Q750859","display_name":"Emotion classification","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.28189998865127563},{"id":"https://openalex.org/C145633318","wikidata":"https://www.wikidata.org/wiki/Q207125","display_name":"Nonverbal communication","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.2565000057220459},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.26347","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26347","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.26347","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.26347","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.7745906114578247,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Objective":[0],"metrics":[1],"for":[2,7,78],"emotional":[3,18,91,113],"expressiveness":[4],"are":[5,76],"vital":[6],"speech":[8],"generation,":[9],"particularly":[10],"in":[11],"expressive":[12],"synthesis":[13],"and":[14,33,54,65,86],"voice":[15],"conversion":[16],"requiring":[17],"prosody":[19],"transfer.":[20],"To":[21],"quantify":[22],"this,":[23],"the":[24,97],"field":[25],"widely":[26],"relies":[27],"on":[28],"emotion":[29],"similarity":[30,40,80],"between":[31],"reference":[32],"generated":[34],"samples.":[35],"This":[36,103],"approach":[37],"computes":[38],"cosine":[39],"of":[41],"embeddings":[42],"from":[43],"encoders":[44],"like":[45],"emotion2vec,":[46],"assuming":[47],"they":[48],"capture":[49],"affective":[50],"cues":[51],"despite":[52],"linguistic":[53,85],"speaker":[55,87],"variations.":[56],"We":[57],"challenge":[58],"this":[59],"assumption":[60],"through":[61],"controlled":[62],"adversarial":[63],"tasks":[64],"human":[66,101],"alignment":[67],"tests.":[68],"Despite":[69],"high":[70],"classification":[71],"accuracy,":[72],"these":[73],"latent":[74],"spaces":[75],"unsuitable":[77],"zero-shot":[79],"evaluation.":[81],"Representational":[82],"limitations":[83],"cause":[84],"interference":[88],"to":[89],"overshadow":[90],"features,":[92],"degrading":[93],"discriminative":[94],"ability.":[95],"Consequently,":[96],"metric":[98],"misaligns":[99],"with":[100],"perception.":[102],"acoustic":[104,109],"vulnerability":[105],"reveals":[106],"it":[107],"rewards":[108],"mimicry":[110],"over":[111],"genuine":[112],"synthesis.":[114]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-01T00:00:00"}
