{"id":"https://openalex.org/W7127159185","doi":"https://doi.org/10.48550/arxiv.2601.22873","title":"EmoShift: Lightweight Activation Steering for Enhanced Emotion-Aware Speech Synthesis","display_name":"EmoShift: Lightweight Activation Steering for Enhanced Emotion-Aware Speech Synthesis","publication_year":2026,"publication_date":"2026-01-30","ids":{"openalex":"https://openalex.org/W7127159185","doi":"https://doi.org/10.48550/arxiv.2601.22873"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.22873","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124801408","display_name":"Li Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhou, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124787543","display_name":"Hao Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124761725","display_name":"Junjie Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Junjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124772690","display_name":"Tianrui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Tianrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124768279","display_name":"Haizhou Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haizhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5124801408"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.566100001335144,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.566100001335144,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12488","display_name":"Mental Health via Writing","score":0.0868000015616417,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.07720000296831131,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.8662999868392944},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6399000287055969},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.6306999921798706},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.49729999899864197},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.4059000015258789},{"id":"https://openalex.org/keywords/expression","display_name":"Expression (computer science)","score":0.3995000123977661},{"id":"https://openalex.org/keywords/emotional-expression","display_name":"Emotional expression","score":0.3815999925136566},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3625999987125397}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.8662999868392944},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7210000157356262},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6399000287055969},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.6306999921798706},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6281999945640564},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.49729999899864197},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.4059000015258789},{"id":"https://openalex.org/C90559484","wikidata":"https://www.wikidata.org/wiki/Q778379","display_name":"Expression (computer science)","level":2,"score":0.3995000123977661},{"id":"https://openalex.org/C143110190","wikidata":"https://www.wikidata.org/wiki/Q5373787","display_name":"Emotional expression","level":2,"score":0.3815999925136566},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36550000309944153},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3625999987125397},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.34599998593330383},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3280999958515167},{"id":"https://openalex.org/C91682802","wikidata":"https://www.wikidata.org/wiki/Q620538","display_name":"Multidimensional scaling","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.32499998807907104},{"id":"https://openalex.org/C168900304","wikidata":"https://www.wikidata.org/wiki/Q171407","display_name":"Valence (chemistry)","level":2,"score":0.29980000853538513},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2703999876976013},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.22873","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.22873","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.22873","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.22873","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Achieving":[0],"precise":[1],"and":[2,11,80,87,102,108,117,128],"controllable":[3,133],"emotional":[4,112,134],"expression":[5,84],"is":[6],"crucial":[7],"for":[8,66,132],"producing":[9],"natural":[10],"context-appropriate":[12],"speech":[13,137],"in":[14,70,106,136],"text-to-speech":[15],"(TTS)":[16],"synthesis.":[17,138],"However,":[18],"many":[19],"emotion-aware":[20],"TTS":[21],"systems,":[22],"including":[23],"large":[24],"language":[25],"model":[26,42],"(LLM)-based":[27],"designs,":[28],"rely":[29],"on":[30],"scaling":[31],"fixed":[32],"emotion":[33,69],"embeddings":[34],"or":[35],"external":[36],"guidance,":[37],"limiting":[38],"their":[39],"ability":[40],"to":[41,75],"emotion-specific":[43],"latent":[44,78],"characteristics.":[45],"To":[46],"address":[47],"this":[48],"gap,":[49],"we":[50],"present":[51],"EmoShift,":[52],"a":[53,58,63],"lightweight":[54],"activation-steering":[55],"framework":[56],"incorporating":[57],"EmoSteer":[59,125],"layer,":[60],"which":[61],"learns":[62],"steering":[64],"vector":[65],"each":[67],"target":[68],"the":[71,123],"output":[72],"embedding":[73],"space":[74],"capture":[76],"its":[77,130],"offset":[79],"maintain":[81],"stable,":[82],"appropriate":[83],"across":[85],"utterances":[86],"categories.":[88],"With":[89],"only":[90],"10M":[91],"trainable":[92],"parameters,less":[93],"than":[94],"1/30":[95],"of":[96],"full":[97],"fine-tuning,":[98],"EmoShift":[99],"outperforms":[100],"zero-shot":[101],"fully":[103],"fine-tuned":[104],"baselines":[105],"objective":[107],"subjective":[109],"evaluations,":[110],"enhancing":[111],"expressiveness":[113],"while":[114],"preserving":[115],"naturalness":[116],"speaker":[118],"similarity.":[119],"Further":[120],"analysis":[121],"confirms":[122],"proposed":[124],"layer's":[126],"effectiveness":[127],"reveals":[129],"potential":[131],"intensity":[135]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-03T00:00:00"}
