{"id":"https://openalex.org/W4408346048","doi":"https://doi.org/10.1109/icassp49660.2025.10890697","title":"Can We \"Cherry-Pick\"? Investigating Multiple Renditions from a Generative Speech Synthesis Model","display_name":"Can We \"Cherry-Pick\"? Investigating Multiple Renditions from a Generative Speech Synthesis Model","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408346048","doi":"https://doi.org/10.1109/icassp49660.2025.10890697"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890697","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890697","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053310461","display_name":"Adaeze Adigwe","orcid":null},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Adaeze Adigwe","raw_affiliation_strings":["University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042248416","display_name":"Sarenne Wallbridge","orcid":"https://orcid.org/0000-0002-1401-4492"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Sarenne Wallbridge","raw_affiliation_strings":["University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051713582","display_name":"Zehai Tu","orcid":null},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zehai Tu","raw_affiliation_strings":["Tencent,LightSpeed Studios"],"affiliations":[{"raw_affiliation_string":"Tencent,LightSpeed Studios","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011726172","display_name":"Catherine Lai","orcid":"https://orcid.org/0000-0001-8861-0281"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Catherine Lai","raw_affiliation_strings":["University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101826175","display_name":"Simon King","orcid":"https://orcid.org/0000-0001-6769-5757"},"institutions":[{"id":"https://openalex.org/I98677209","display_name":"University of Edinburgh","ror":"https://ror.org/01nrxwf90","country_code":"GB","type":"education","lineage":["https://openalex.org/I98677209"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Simon King","raw_affiliation_strings":["University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK"],"affiliations":[{"raw_affiliation_string":"University of Edinburgh,The Centre for Speech Technology Research,Edinburgh,UK","institution_ids":["https://openalex.org/I98677209"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5053310461"],"corresponding_institution_ids":["https://openalex.org/I98677209"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.01827244,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9535999894142151,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9535999894142151,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.642471432685852},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5823279619216919},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5615383982658386},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.5081944465637207},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.44845929741859436},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3847999572753906},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3517076075077057}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.642471432685852},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5823279619216919},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5615383982658386},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.5081944465637207},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.44845929741859436},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3847999572753906},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3517076075077057}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890697","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890697","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2763110165","https://openalex.org/W3024869864","https://openalex.org/W3161480375","https://openalex.org/W3193768936","https://openalex.org/W3196231874","https://openalex.org/W4372260337","https://openalex.org/W4381786045","https://openalex.org/W4385823027","https://openalex.org/W4385993911","https://openalex.org/W4391833199","https://openalex.org/W4402111523","https://openalex.org/W4402112013","https://openalex.org/W4402672020","https://openalex.org/W4408355874","https://openalex.org/W4411119606","https://openalex.org/W6750489868","https://openalex.org/W6790356757","https://openalex.org/W6848735303","https://openalex.org/W6861507043","https://openalex.org/W6865936227"],"related_works":["https://openalex.org/W4365211920","https://openalex.org/W3014948380","https://openalex.org/W4391584540","https://openalex.org/W4380551139","https://openalex.org/W4317695495","https://openalex.org/W4395044357","https://openalex.org/W4287117424","https://openalex.org/W4387506531","https://openalex.org/W2087346071","https://openalex.org/W2967848559"],"abstract_inverted_index":{"Generative":[0],"Speech":[1],"Models":[2],"(GSMs)":[3],"have":[4],"seen":[5],"a":[6,28,33,52,60,68,84,106,124,127,154,166,193],"surge":[7],"in":[8,196],"popularity":[9],"due":[10],"to":[11,14,58,105,115,136],"their":[12],"ability":[13],"generate":[15,23],"diverse":[16],"and":[17,73,110,148],"high-quality":[18],"speech.":[19],"Evaluating":[20],"models":[21],"that":[22,156,189],"many":[24],"different":[25],"renditions":[26,82,173],"for":[27,43,71,224],"given":[29],"input":[30],"sentence":[31],"presents":[32],"new":[34],"challenge.":[35],"Listening":[36],"tests":[37],"are":[38],"still":[39],"the":[40,144,150,175,182,208],"gold":[41],"standard":[42],"evaluating":[44,72],"synthetic":[45],"speech,":[46],"but":[47,198],"current":[48],"paradigms":[49],"only":[50],"consider":[51],"single":[53],"arbitrary":[54],"rendition:":[55],"this":[56,120,131],"fails":[57],"give":[59],"complete":[61],"picture":[62],"of":[63,86,126,130,146,174,184,210],"best/typical/worst-rendition":[64],"performance.":[65],"We":[66,142,187],"propose":[67],"general":[69],"framework":[70,101,132,203],"deploying":[74],"generative":[75,108],"speech":[76],"models.":[77],"This":[78],"involves":[79],"selecting":[80],"amongst":[81],"using":[83,92],"sequence":[85],"filtering":[87],"or":[88,96],"ranking":[89],"steps,":[90],"each":[91],"either":[93],"an":[94],"objective":[95,222],"subjective":[97,167],"(listening)":[98],"method.":[99],"The":[100],"is":[102,157,191,204,212],"not":[103,199],"tied":[104],"particular":[107],"model,":[109],"so":[111],"could":[112,217],"be":[113,218],"applied":[114],"any":[116],"such":[117],"model.":[118],"In":[119,165,206,214],"paper,":[121],"we":[122,180],"provide":[123],"demonstration":[125],"simple":[128],"version":[129],"which":[133,179],"would":[134],"apply":[135],"use-cases":[137],"where":[138],"best-rendition":[139],"performance":[140],"matters.":[141],"explore":[143],"concept":[145],"\"cherry-picking\",":[147],"ask":[149],"question":[151],"\"Is":[152],"there":[153,190],"rendition":[155,195],"consistently":[158],"preferred":[159,194],"above":[160],"all":[161,200],"others":[162],"by":[163],"listeners?\".":[164],"listening":[168],"test,":[169],"participants":[170],"ranked":[171],"several":[172],"same":[176],"sentence,":[177],"from":[178],"measured":[181],"prevalence":[183],"exceptional":[185],"renditions.":[186],"find":[188],"indeed":[192],"many,":[197],"cases.":[201],"Our":[202],"flexible.":[205],"particular,":[207],"use":[209],"listeners":[211],"optional.":[213],"future,":[215],"they":[216],"replaced":[219],"with":[220],"model-based":[221],"measures,":[223],"example.":[225]},"counts_by_year":[],"updated_date":"2025-12-28T23:10:05.387466","created_date":"2025-10-10T00:00:00"}
