{"id":"https://openalex.org/W7106654555","doi":"https://doi.org/10.48550/arxiv.2511.17555","title":"Speech Recognition Model Improves Text-to-Speech Synthesis using Fine-Grained Reward","display_name":"Speech Recognition Model Improves Text-to-Speech Synthesis using Fine-Grained Reward","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W7106654555","doi":"https://doi.org/10.48550/arxiv.2511.17555"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2511.17555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2511.17555","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wang, Guansu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Guansu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Sun, Peijie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Peijie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8317000269889832,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.8317000269889832,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.02410000003874302,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.021199999377131462,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.7387999892234802},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5418000221252441},{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.4927000105381012},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.44679999351501465},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3709999918937683},{"id":"https://openalex.org/keywords/regression","display_name":"Regression","score":0.35830000042915344},{"id":"https://openalex.org/keywords/mean-opinion-score","display_name":"Mean opinion score","score":0.3474999964237213},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.34709998965263367}],"concepts":[{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.7387999892234802},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7361000180244446},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6190999746322632},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5418000221252441},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5325999855995178},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.4927000105381012},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.44679999351501465},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3709999918937683},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.35830000042915344},{"id":"https://openalex.org/C62897895","wikidata":"https://www.wikidata.org/wiki/Q1915482","display_name":"Mean opinion score","level":3,"score":0.3474999964237213},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.34709998965263367},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3345000147819519},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33180001378059387},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.32089999318122864},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C152877465","wikidata":"https://www.wikidata.org/wiki/Q208042","display_name":"Regression analysis","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2619999945163727},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.25760000944137573}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2511.17555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2511.17555","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.17555","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,38],"text-to-speech":[3],"(TTS)":[4],"have":[5],"enabled":[6],"models":[7,48,133],"to":[8,90],"clone":[9],"arbitrary":[10],"unseen":[11,119],"speakers":[12],"and":[13,56,94,114],"synthesize":[14],"high-quality,":[15],"natural-sounding":[16],"speech.":[17],"However,":[18],"evaluation":[19],"methods":[20],"lag":[21],"behind:":[22],"typical":[23],"mean":[24],"opinion":[25],"score":[26],"(MOS)":[27],"estimators":[28],"perform":[29],"regression":[30],"over":[31],"entire":[32],"utterances,":[33],"while":[34],"failures":[35],"usually":[36],"occur":[37],"a":[39,61,86,100,126],"few":[40],"problematic":[41],"words.":[42],"We":[43],"observe":[44],"that":[45,105],"encoder-decoder":[46],"ASR":[47,88],"(e.g.,":[49],"Whisper)":[50],"surface":[51],"word-level":[52],"mismatches":[53],"between":[54],"speech":[55],"text":[57],"via":[58],"cross-attention,":[59],"providing":[60],"fine-grained":[62,140],"reward":[63,80],"signal.":[64],"Building":[65],"on":[66,118],"this,":[67],"we":[68],"introduce":[69],"Word-level":[70],"TTS":[71,101,112],"Alignment":[72],"by":[73,99],"ASR-driven":[74],"Attentive":[75],"Reward":[76],"(W3AR).":[77],"Without":[78],"explicit":[79],"annotations,":[81],"W3AR":[82,106],"uses":[83],"attention":[84],"from":[85],"pre-trained":[87],"model":[89],"drive":[91],"finer-grained":[92],"alignment":[93],"optimization":[95],"of":[96,110],"sequences":[97],"predicted":[98],"model.":[102],"Experiments":[103],"show":[104],"improves":[107],"the":[108],"quality":[109],"existing":[111],"systems":[113],"strengthens":[115],"zero-shot":[116],"robustness":[117],"speakers.":[120],"More":[121],"broadly,":[122],"our":[123],"results":[124],"suggest":[125],"simple":[127],"recipe":[128],"for":[129,142],"generative":[130],"modeling:":[131],"understanding":[132],"can":[134],"act":[135],"as":[136],"evaluators,":[137],"delivering":[138],"informative,":[139],"feedback":[141],"optimization.":[143]},"counts_by_year":[],"updated_date":"2025-11-27T01:16:37.896743","created_date":"2025-11-27T00:00:00"}
