{"id":"https://openalex.org/W7160382906","doi":"https://doi.org/10.1016/j.ins.2026.123591","title":"WAVe: Word-aligned verification of synthetic speech for ASR","display_name":"WAVe: Word-aligned verification of synthetic speech for ASR","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160382906","doi":"https://doi.org/10.1016/j.ins.2026.123591"},"language":"en","primary_location":{"id":"doi:10.1016/j.ins.2026.123591","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.ins.2026.123591","pdf_url":null,"source":{"id":"https://openalex.org/S192650101","display_name":"Information Sciences","issn_l":"0020-0255","issn":["0020-0255","1872-6291"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information Sciences","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1016/j.ins.2026.123591","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114312941","display_name":"Yuriy Perezhohin","orcid":null},"institutions":[{"id":"https://openalex.org/I83558840","display_name":"Universidade Nova de Lisboa","ror":"https://ror.org/02xankh89","country_code":"PT","type":"education","lineage":["https://openalex.org/I83558840"]}],"countries":["PT"],"is_corresponding":false,"raw_author_name":"Yuriy Perezhohin","raw_affiliation_strings":["NOVA Information Management School (NOVA IMS), Universidade NOVA de Lisboa, Campus de Campolide, Lisboa, 1070-312, Portugal"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NOVA Information Management School (NOVA IMS), Universidade NOVA de Lisboa, Campus de Campolide, Lisboa, 1070-312, Portugal","institution_ids":["https://openalex.org/I83558840"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5087976149","display_name":"Mauro Castelli","orcid":"https://orcid.org/0000-0002-8793-1451"},"institutions":[{"id":"https://openalex.org/I83558840","display_name":"Universidade Nova de Lisboa","ror":"https://ror.org/02xankh89","country_code":"PT","type":"education","lineage":["https://openalex.org/I83558840"]}],"countries":["PT"],"is_corresponding":true,"raw_author_name":"Mauro Castelli","raw_affiliation_strings":["NOVA Information Management School (NOVA IMS), Universidade NOVA de Lisboa, Campus de Campolide, Lisboa, 1070-312, Portugal"],"raw_orcid":"https://orcid.org/0000-0002-8793-1451","affiliations":[{"raw_affiliation_string":"NOVA Information Management School (NOVA IMS), Universidade NOVA de Lisboa, Campus de Campolide, Lisboa, 1070-312, Portugal","institution_ids":["https://openalex.org/I83558840"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5087976149"],"corresponding_institution_ids":["https://openalex.org/I83558840"],"apc_list":{"value":3330,"currency":"USD","value_usd":3330},"apc_paid":{"value":3330,"currency":"USD","value_usd":3330},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93200794,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"752","issue":null,"first_page":"123591","last_page":"123591"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9158999919891357,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9158999919891357,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.05790000036358833,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.0035000001080334187,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5699999928474426},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5557000041007996},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.5216000080108643},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.47099998593330383},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.42910000681877136},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.41119998693466187},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.39070001244544983}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8296999931335449},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7128000259399414},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5699999928474426},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5557000041007996},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.5216000080108643},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.47099998593330383},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4388999938964844},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.41119998693466187},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.39070001244544983},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3896999955177307},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.33410000801086426},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.3296000063419342},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32690000534057617},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29600000381469727},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.26249998807907104},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1016/j.ins.2026.123591","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.ins.2026.123591","pdf_url":null,"source":{"id":"https://openalex.org/S192650101","display_name":"Information Sciences","issn_l":"0020-0255","issn":["0020-0255","1872-6291"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information Sciences","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1016/j.ins.2026.123591","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.ins.2026.123591","pdf_url":null,"source":{"id":"https://openalex.org/S192650101","display_name":"Information Sciences","issn_l":"0020-0255","issn":["0020-0255","1872-6291"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information Sciences","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.5575686097145081,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320334779","display_name":"Funda\u00e7\u00e3o para a Ci\u00eancia e a Tecnologia","ror":"https://ror.org/00snfqn58"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":4,"referenced_works":["https://openalex.org/W3100732527","https://openalex.org/W4252684946","https://openalex.org/W4403510311","https://openalex.org/W4403652896"],"related_works":[],"abstract_inverted_index":{"Automatic":[0],"speech":[1,14,45],"recognition":[2],"for":[3,179],"low-resource":[4],"languages":[5],"often":[6],"relies":[7],"on":[8,129,154],"synthetic":[9,33,74,94,180],"utterances":[10,17],"to":[11,167,194],"augment":[12],"limited":[13],"data;":[15],"these":[16],"are":[18],"generated":[19,68],"by":[20,57,142],"pairing":[21],"large-language-model":[22],"transcripts":[23],"with":[24,61,90,116],"neural":[25],"text-to-speech":[26],"(TTS)":[27],"audio.":[28],"However,":[29],"indiscriminate":[30],"incorporation":[31],"of":[32,93,101,139,188],"audio":[34,62,75],"can":[35],"reduce":[36],"training":[37,86,140],"efficiency":[38],"and":[39,71,79,96,186],"introduce":[40,48],"errors":[41],"such":[42],"as":[43,173],"unnatural":[44],"patterns.":[46],"We":[47,83],"WAVe,":[49,66],"a":[50,80,111,119],"model":[51,123],"that":[52,160],"verifies":[53],"word-to-audio":[54],"frame":[55],"correspondence":[56],"aligning":[58],"text":[59],"representations":[60],"features.":[63],"To":[64],"evaluate":[65],"we":[67],"22k":[69],"Portuguese":[70],"35k":[72],"Dutch":[73,130],"samples":[76],"using":[77],"GPT-4o-mini":[78],"TTS":[81],"system.":[82],"created":[84],"four":[85],"subsets":[87],"per":[88],"language":[89],"varying":[91],"proportions":[92],"data":[95,181],"fine-tuned":[97],"three":[98],"Whisper":[99],"models":[100],"different":[102],"sizes.":[103],"For":[104],"Portuguese,":[105],"our":[106],"high-quality":[107],"29k-sample":[108],"subset":[109],"achieved":[110],"7.9%":[112],"word":[113],"error":[114],"rate":[115],"Whisper-Large-v3,":[117],"outperforming":[118],"recent":[120],"competitive":[121],"55k-sample":[122],"trained":[124],"under":[125],"identical":[126],"conditions.":[127],"Experiments":[128],"showed":[131],"similarly":[132],"consistent":[133],"improvements.":[134],"WAVe":[135,172],"reduces":[136,163],"the":[137,155,184],"number":[138],"steps":[141],"34%,":[143],"substantially":[144],"lowering":[145],"computational":[146],"cost":[147],"while":[148],"improving":[149],"ASR":[150,195],"quality.":[151],"Cross-domain":[152],"evaluation":[153],"Multilingual":[156],"LibriSpeech":[157],"benchmark":[158],"demonstrates":[159],"WAVe-based":[161],"filtering":[162],"WER":[164],"from":[165],"13.54%":[166],"6.89%.":[168],"These":[169],"results":[170],"establish":[171],"an":[174],"effective":[175],"quality":[176],"control":[177],"mechanism":[178],"pipelines,":[182],"enabling":[183],"identification":[185],"removal":[187],"poorly":[189],"synthesized":[190],"audio-text":[191],"pairs":[192],"prior":[193],"fine-tuning.":[196]},"counts_by_year":[],"updated_date":"2026-05-09T06:09:20.037420","created_date":"2026-05-07T00:00:00"}
