{"id":"https://openalex.org/W4416771703","doi":"https://doi.org/10.1109/sped67700.2025.11251920","title":"Latent Insights: Exploring Phoneme Diversity in Natural and Synthetic Speech through Latent Representations","display_name":"Latent Insights: Exploring Phoneme Diversity in Natural and Synthetic Speech through Latent Representations","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416771703","doi":"https://doi.org/10.1109/sped67700.2025.11251920"},"language":null,"primary_location":{"id":"doi:10.1109/sped67700.2025.11251920","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11251920","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040987999","display_name":"Diptasree Debnath","orcid":"https://orcid.org/0000-0003-4676-0745"},"institutions":[{"id":"https://openalex.org/I100930933","display_name":"University College Dublin","ror":"https://ror.org/05m7pjf47","country_code":"IE","type":"education","lineage":["https://openalex.org/I100930933"]}],"countries":["IE"],"is_corresponding":true,"raw_author_name":"Diptasree Debnath","raw_affiliation_strings":["University College Dublin,School of Computer Science,Dublin,Ireland"],"affiliations":[{"raw_affiliation_string":"University College Dublin,School of Computer Science,Dublin,Ireland","institution_ids":["https://openalex.org/I100930933"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014946755","display_name":"Helard Becerra Martinez","orcid":"https://orcid.org/0000-0003-2652-3195"},"institutions":[{"id":"https://openalex.org/I100930933","display_name":"University College Dublin","ror":"https://ror.org/05m7pjf47","country_code":"IE","type":"education","lineage":["https://openalex.org/I100930933"]}],"countries":["IE"],"is_corresponding":false,"raw_author_name":"Helard Becerra Martinez","raw_affiliation_strings":["University College Dublin,School of Computer Science,Dublin,Ireland"],"affiliations":[{"raw_affiliation_string":"University College Dublin,School of Computer Science,Dublin,Ireland","institution_ids":["https://openalex.org/I100930933"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5008608844","display_name":"Andrew Hines","orcid":"https://orcid.org/0000-0001-9636-2556"},"institutions":[{"id":"https://openalex.org/I100930933","display_name":"University College Dublin","ror":"https://ror.org/05m7pjf47","country_code":"IE","type":"education","lineage":["https://openalex.org/I100930933"]}],"countries":["IE"],"is_corresponding":false,"raw_author_name":"Andrew Hines","raw_affiliation_strings":["University College Dublin,School of Computer Science,Dublin,Ireland"],"affiliations":[{"raw_affiliation_string":"University College Dublin,School of Computer Science,Dublin,Ireland","institution_ids":["https://openalex.org/I100930933"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5040987999"],"corresponding_institution_ids":["https://openalex.org/I100930933"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.20612659,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"69","last_page":"74"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.44449999928474426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.44449999928474426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.11410000175237656,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.10419999808073044,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.7821000218391418},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.6876000165939331},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.6567000150680542},{"id":"https://openalex.org/keywords/natural","display_name":"Natural (archaeology)","score":0.6008999943733215},{"id":"https://openalex.org/keywords/diversity","display_name":"Diversity (politics)","score":0.47209998965263367},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3921000063419342},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.33649998903274536}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.7821000218391418},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.6876000165939331},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.6567000150680542},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6416000127792358},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.6008999943733215},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.47209998965263367},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4339999854564667},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4221999943256378},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41769999265670776},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3921000063419342},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.33649998903274536},{"id":"https://openalex.org/C2780646970","wikidata":"https://www.wikidata.org/wiki/Q6980787","display_name":"Natural sounds","level":2,"score":0.3208000063896179},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.2962999939918518},{"id":"https://openalex.org/C542774811","wikidata":"https://www.wikidata.org/wiki/Q10880526","display_name":"Prosody","level":2,"score":0.29499998688697815},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C2781202465","wikidata":"https://www.wikidata.org/wiki/Q18346297","display_name":"Lexical diversity","level":3,"score":0.26759999990463257},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.25699999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/sped67700.2025.11251920","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11251920","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1986595082","https://openalex.org/W2079623482","https://openalex.org/W2747874407","https://openalex.org/W2972359262","https://openalex.org/W2992384298","https://openalex.org/W3175537473","https://openalex.org/W3205635414","https://openalex.org/W3209059054","https://openalex.org/W4280596009","https://openalex.org/W4281492411","https://openalex.org/W4294903856","https://openalex.org/W4385822832","https://openalex.org/W4385823003","https://openalex.org/W4392981073","https://openalex.org/W4403653524","https://openalex.org/W4406417959"],"related_works":[],"abstract_inverted_index":{"The":[0],"growing":[1],"use":[2],"of":[3,34,148,162],"synthetic":[4,35,60,83,114,126,166,196,211],"speech":[5,17,36,61,84,127,197],"highlights":[6],"the":[7,32,53,131,144,173],"need":[8,54],"to":[9,58,95,135,193,204],"understand":[10],"its":[11],"differences":[12,79,186],"from":[13,72],"natural":[14,81,124],"speech.":[15,115,212],"Synthetic":[16],"provides":[18],"potential":[19],"advantages":[20],"in":[21,113,165,187],"data":[22,29,138],"augmentation,":[23],"including":[24],"privacy,":[25],"security,":[26],"and":[27,41,48,63,77,82,90,125,142,146,170,177,184,202],"ethical":[28],"sourcing.":[30],"However,":[31],"naturalness":[33],"is":[37,94],"both":[38],"a":[39,42,159],"limitation":[40],"risk,":[43],"with":[44,139,168],"deepfakes":[45],"facilitating":[46],"misinformation":[47],"fraud.":[49],"These":[50],"challenges":[51],"underscore":[52],"for":[55,210],"improved":[56],"methods":[57],"evaluate":[59],"quality":[62,207],"deepfake":[64],"detection.":[65],"This":[66],"research":[67],"investigates":[68],"whether":[69],"latent":[70,150,188],"representations":[71,151,189],"self-supervised":[73],"models":[74,98,121],"can":[75,190],"identify":[76],"quantify":[78],"between":[80],"regarding":[85],"phoneme":[86,101],"type,":[87],"stress,":[88],"manner,":[89],"roundedness.":[91],"Our":[92,156],"objective":[93],"determine":[96],"if":[97,105],"learn":[99],"all":[100],"categories":[102],"equally":[103],"or":[104],"certain":[106],"groups":[107],"present":[108],"greater":[109],"challenges,":[110],"revealing":[111],"limitations":[112],"We":[116,129],"pre-trained":[117],"two":[118],"wav2vec":[119],"2.0":[120],"using":[122],"matched":[123],"datasets.":[128],"mapped":[130],"learned":[132],"codeword":[133],"dictionaries":[134],"labeled":[136],"test":[137],"phoneme-level":[140],"annotations":[141],"analysed":[143],"distribution":[145],"diversity":[147,164],"these":[149],"across":[152],"different":[153],"phonemic":[154],"categories.":[155],"findings":[157],"indicate":[158],"general":[160],"lack":[161],"phonetic":[163],"speech,":[167],"stress":[169],"manner":[171],"showing":[172],"largest":[174],"disparities.":[175],"Vowels":[176],"diphthongs":[178],"consistently":[179],"exhibit":[180],"reduced":[181],"diversity.":[182],"Identifying":[183],"quantifying":[185],"be":[191],"applied":[192],"enable":[194],"enhanced":[195],"generation,":[198],"improve":[199],"classification":[200],"accuracy,":[201],"help":[203],"develop":[205],"robust":[206],"measurement":[208],"metrics":[209]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-27T00:00:00"}
