{"id":"https://openalex.org/W7154214126","doi":"https://doi.org/10.48550/arxiv.2604.11424","title":"Bridging What the Model Thinks and How It Speaks: Self-Aware Speech Language Models for Expressive Speech Generation","display_name":"Bridging What the Model Thinks and How It Speaks: Self-Aware Speech Language Models for Expressive Speech Generation","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154214126","doi":"https://doi.org/10.48550/arxiv.2604.11424"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11424","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11424","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11424","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133572316","display_name":"Kuang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Kuang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119876232","display_name":"Lai Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Lai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133571706","display_name":"Qibing Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Qibing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128512014","display_name":"Ping Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Ping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111361888","display_name":"Wenkai Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Wenkai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133597821","display_name":"Feng Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Feng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133618565","display_name":"Zhongjie Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Zhongjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133604683","display_name":"Jun Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133567236","display_name":"Yannan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yannan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133557316","display_name":"Haizhou Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haizhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5133572316"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.11550000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.11550000309944153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.10980000346899033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12128","display_name":"AI in Service Interactions","score":0.093299999833107,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.8055999875068665},{"id":"https://openalex.org/keywords/realization","display_name":"Realization (probability)","score":0.588699996471405},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5728999972343445},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4821999967098236},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.42579999566078186},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4068000018596649}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.805899977684021},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.8055999875068665},{"id":"https://openalex.org/C2781089630","wikidata":"https://www.wikidata.org/wiki/Q21856745","display_name":"Realization (probability)","level":2,"score":0.588699996471405},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5728999972343445},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4821999967098236},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.478300005197525},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4528999924659729},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4068000018596649},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3707999885082245},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33649998903274536},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.26649999618530273},{"id":"https://openalex.org/C2984865316","wikidata":"https://www.wikidata.org/wiki/Q25481968","display_name":"Speech act","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11424","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11424","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11424","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11424","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.4870712459087372,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speech":[0,82],"Language":[1,83],"Models":[2],"(SLMs)":[3],"exhibit":[4],"strong":[5],"semantic":[6,29],"understanding,":[7],"yet":[8],"their":[9],"generated":[10],"speech":[11,138,180],"often":[12],"sounds":[13],"flat":[14],"and":[15,58,101,148,162,190],"fails":[16],"to":[17,37,48,126,146,160],"convey":[18],"expressive":[19,56,135,168,179],"intent,":[20,136],"undermining":[21],"user":[22],"engagement.":[23],"We":[24,33],"term":[25],"this":[26,35,109],"mismatch":[27],"the":[28,50,87,90,128,143,154,201],"understanding-acoustic":[30],"realization":[31,165],"gap.":[32],"attribute":[34],"gap":[36,110],"two":[38,112],"key":[39],"deficiencies:":[40],"(1)":[41,115],"intent":[42,53,169],"transmission":[43],"failure,":[44],"where":[45,62],"SLMs":[46],"fail":[47],"provide":[49],"stable":[51],"utterance-level":[52],"needed":[54],"for":[55],"delivery;":[57],"(2)":[59,149],"realization-unaware":[60],"training,":[61],"no":[63],"feedback":[64],"signal":[65],"verifies":[66],"whether":[67],"acoustic":[68,164],"outputs":[69],"faithfully":[70],"reflect":[71],"intended":[72,167],"expression.":[73],"To":[74],"address":[75],"these":[76],"issues,":[77],"we":[78],"propose":[79],"SA-SLM":[80,107,185],"(Self-Aware":[81],"Model),":[84],"built":[85],"on":[86,174,200],"principle":[88],"that":[89],"model":[91,144,155],"should":[92],"be":[93],"aware":[94,140],"of":[95,141,178,195],"what":[96,142],"it":[97,103],"thinks":[98],"during":[99,105],"generation":[100,139],"how":[102],"speaks":[104],"training.":[106],"addresses":[108],"through":[111],"core":[113],"contributions:":[114],"Intent-Aware":[116],"Bridging,":[117],"which":[118,152],"uses":[119],"a":[120],"Variational":[121],"Information":[122],"Bottleneck":[123],"(VIB)":[124],"objective":[125],"translate":[127],"model's":[129],"internal":[130],"semantics":[131],"into":[132],"temporally":[133],"smooth":[134],"making":[137],"intends":[145],"express;":[147],"Realization-Aware":[150],"Alignment,":[151],"repurposes":[153],"as":[156],"its":[157],"own":[158],"critic":[159],"verify":[161],"align":[163],"with":[166],"via":[170],"rubric-based":[171],"feedback.":[172],"Trained":[173],"only":[175],"800":[176],"hours":[177],"data,":[181],"our":[182],"3B":[183],"parameter":[184],"surpasses":[186],"all":[187],"open-source":[188],"baselines":[189],"comes":[191],"within":[192],"0.08":[193],"points":[194],"GPT-4o-Audio":[196],"in":[197],"overall":[198],"expressiveness":[199],"EchoMind":[202],"benchmark.":[203]},"counts_by_year":[],"updated_date":"2026-04-15T06:04:33.058270","created_date":"2026-04-15T00:00:00"}
