{"id":"https://openalex.org/W7154708193","doi":"https://doi.org/10.48550/arxiv.2604.14932","title":"WavAlign: Enhancing Intelligence and Expressiveness in Spoken Dialogue Models via Adaptive Hybrid Post-Training","display_name":"WavAlign: Enhancing Intelligence and Expressiveness in Spoken Dialogue Models via Adaptive Hybrid Post-Training","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7154708193","doi":"https://doi.org/10.48550/arxiv.2604.14932"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.14932","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14932","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.14932","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133884933","display_name":"Yifu Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Yifu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133909062","display_name":"Shengpeng Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Shengpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133890451","display_name":"Qian Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125996254","display_name":"Tianle Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Tianle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073742206","display_name":"Yangzhuo Li","orcid":"https://orcid.org/0000-0003-4450-2849"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yangzhuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133910708","display_name":"Ziqing Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Ziqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133839208","display_name":"Wen J. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133893364","display_name":"Jingyu Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Jingyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057219846","display_name":"Haoxiao Wang","orcid":"https://orcid.org/0009-0000-4252-4353"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Haoxiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133882385","display_name":"Xueyi Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pu, Xueyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129682877","display_name":"Fan Zhuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuo, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133916404","display_name":"Zhou Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5133884933"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.38179999589920044,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.38179999589920044,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2678999900817871,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.08869999647140503,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.6883000135421753},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.46389999985694885},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.445499986410141},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.4009999930858612},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.38260000944137573},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.37540000677108765}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7537000179290771},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.6883000135421753},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5390999913215637},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.46389999985694885},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45669999718666077},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.445499986410141},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.4009999930858612},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.38260000944137573},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.37540000677108765},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.36809998750686646},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2696000039577484}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.14932","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14932","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.14932","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14932","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"End-to-end":[0],"spoken":[1,31,58,109,145],"dialogue":[2,32,59,146],"models":[3,33],"have":[4],"garnered":[5],"significant":[6],"attention":[7],"because":[8],"they":[9],"offer":[10],"a":[11,99],"higher":[12],"potential":[13],"ceiling":[14],"in":[15,46,155],"expressiveness":[16,27],"and":[17,26,76,119,148,151,158],"perceptual":[18],"ability":[19],"than":[20],"cascaded":[21],"systems.":[22],"However,":[23],"the":[24,40,71,95,116,141],"intelligence":[25],"of":[28,42,73],"current":[29],"open-source":[30],"often":[34],"remain":[35],"below":[36],"expectations.":[37],"Motivated":[38],"by":[39],"success":[41],"online":[43],"reinforcement":[44],"learning(RL)":[45],"other":[47],"domains,":[48],"one":[49],"might":[50],"attempt":[51],"to":[52,57,115,134],"directly":[53],"apply":[54],"preference":[55,83,113,137],"optimization":[56],"models,":[60],"yet":[61],"this":[62],"transfer":[63],"is":[64],"non-trivial.":[65],"We":[66,139],"analyze":[67],"these":[68],"obstacles":[69],"from":[70,131],"perspectives":[72],"reward":[74],"modeling":[75],"rollout":[77,132],"sampling,":[78],"focusing":[79],"on":[80,94],"how":[81],"sparse":[82],"supervision":[84],"interacts":[85],"with":[86],"dense":[87],"speech":[88,159],"generation":[89],"under":[90],"shared-parameter":[91],"updates.":[92],"Based":[93],"analysis,":[96],"we":[97],"propose":[98],"modality-aware":[100],"adaptive":[101],"post-training":[102],"recipe":[103],"that":[104],"makes":[105],"RL":[106],"practical":[107],"for":[108],"dialogue:":[110],"it":[111],"constrains":[112],"updates":[114],"semantic":[117,156],"channel":[118],"improves":[120],"acoustic":[121],"behavior":[122],"via":[123],"explicit":[124],"anchoring,":[125],"while":[126],"dynamically":[127],"regulating":[128],"their":[129],"mixture":[130],"statistics":[133],"avoid":[135],"unreliable":[136],"gradients.":[138],"evaluate":[140],"method":[142],"across":[143],"multiple":[144],"benchmarks":[147],"representative":[149],"architectures,":[150],"observe":[152],"consistent":[153],"improvements":[154],"quality":[157],"expressiveness.":[160]},"counts_by_year":[],"updated_date":"2026-04-18T06:05:20.339008","created_date":"2026-04-18T00:00:00"}
