{"id":"https://openalex.org/W4362654427","doi":"https://doi.org/10.1109/taslp.2023.3264454","title":"Artificial Vocal Learning Guided by Phoneme Recognition and Visual Information","display_name":"Artificial Vocal Learning Guided by Phoneme Recognition and Visual Information","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4362654427","doi":"https://doi.org/10.1109/taslp.2023.3264454"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2023.3264454","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3264454","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004916681","display_name":"Paul Konstantin Krug","orcid":"https://orcid.org/0000-0001-8518-8142"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Paul Konstantin Krug","raw_affiliation_strings":["Institute of Acoustics and Speech Communication, Technische Universit&#x00E4;t, Dresden, Germany"],"raw_orcid":"https://orcid.org/0000-0001-8518-8142","affiliations":[{"raw_affiliation_string":"Institute of Acoustics and Speech Communication, Technische Universit&#x00E4;t, Dresden, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046141664","display_name":"Peter Birkholz","orcid":"https://orcid.org/0000-0003-0167-8123"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peter Birkholz","raw_affiliation_strings":["Institute of Acoustics and Speech Communication, Technische Universit&#x00E4;t, Dresden, Germany"],"raw_orcid":"https://orcid.org/0000-0003-0167-8123","affiliations":[{"raw_affiliation_string":"Institute of Acoustics and Speech Communication, Technische Universit&#x00E4;t, Dresden, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001239900","display_name":"Branislav Gerazov","orcid":"https://orcid.org/0000-0003-2498-6831"},"institutions":[{"id":"https://openalex.org/I76245029","display_name":"Ss. Cyril and Methodius University in Skopje","ror":"https://ror.org/02wk2vx54","country_code":"MK","type":"education","lineage":["https://openalex.org/I76245029"]}],"countries":["MK"],"is_corresponding":false,"raw_author_name":"Branislav Gerazov","raw_affiliation_strings":["Faculty of Electrical Engineering and Information Technologies, Ss. Cyril and Methodius University, Skopje, North Macedonia","Faculty of Electrical Engineering and Information Technologies, Ss. Cyril and Methodius University in Skopje, Republic of North Macedonia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Faculty of Electrical Engineering and Information Technologies, Ss. Cyril and Methodius University, Skopje, North Macedonia","institution_ids":["https://openalex.org/I76245029"]},{"raw_affiliation_string":"Faculty of Electrical Engineering and Information Technologies, Ss. Cyril and Methodius University in Skopje, Republic of North Macedonia","institution_ids":["https://openalex.org/I76245029"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057837548","display_name":"Daniel van Niekerk","orcid":"https://orcid.org/0000-0002-7324-2751"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Daniel Rudolph van Niekerk","raw_affiliation_strings":["Department of Speech, Hearing and Phonetic Sciences, University College London, London, U.K","Department of Speech, Hearing and Phonetic Sciences, University College London, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0002-7324-2751","affiliations":[{"raw_affiliation_string":"Department of Speech, Hearing and Phonetic Sciences, University College London, London, U.K","institution_ids":["https://openalex.org/I45129253"]},{"raw_affiliation_string":"Department of Speech, Hearing and Phonetic Sciences, University College London, United Kingdom","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022797639","display_name":"Anqi Xu","orcid":"https://orcid.org/0000-0002-4331-6676"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Anqi Xu","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0002-4331-6676","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049702508","display_name":"Yi Xu","orcid":"https://orcid.org/0000-0002-8541-2658"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yi Xu","raw_affiliation_strings":["Department of Speech, Hearing and Phonetic Sciences, University College London, London, U.K","Department of Speech, Hearing and Phonetic Sciences, University College London, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0002-8541-2658","affiliations":[{"raw_affiliation_string":"Department of Speech, Hearing and Phonetic Sciences, University College London, London, U.K","institution_ids":["https://openalex.org/I45129253"]},{"raw_affiliation_string":"Department of Speech, Hearing and Phonetic Sciences, University College London, United Kingdom","institution_ids":["https://openalex.org/I45129253"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.1086,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.76841649,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":"31","issue":null,"first_page":"1734","last_page":"1744"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.736208438873291},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7174279093742371},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.676671028137207},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43329405784606934},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4244205951690674},{"id":"https://openalex.org/keywords/phonetics","display_name":"Phonetics","score":0.41890233755111694},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.41142722964286804}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.736208438873291},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7174279093742371},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.676671028137207},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43329405784606934},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4244205951690674},{"id":"https://openalex.org/C137584468","wikidata":"https://www.wikidata.org/wiki/Q35395","display_name":"Phonetics","level":2,"score":0.41890233755111694},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.41142722964286804},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2023.3264454","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taslp.2023.3264454","pdf_url":null,"source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.6899999976158142}],"awards":[{"id":"https://openalex.org/G1297994486","display_name":null,"funder_award_id":"RPG-2019-241","funder_id":"https://openalex.org/F4320319993","funder_display_name":"Leverhulme Trust"}],"funders":[{"id":"https://openalex.org/F4320319993","display_name":"Leverhulme Trust","ror":"https://ror.org/012mzw131"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W98485554","https://openalex.org/W123135133","https://openalex.org/W130216483","https://openalex.org/W1595159159","https://openalex.org/W1963854000","https://openalex.org/W1965378753","https://openalex.org/W1995118599","https://openalex.org/W2011934537","https://openalex.org/W2016090750","https://openalex.org/W2024060531","https://openalex.org/W2041110166","https://openalex.org/W2057417891","https://openalex.org/W2058961190","https://openalex.org/W2066940820","https://openalex.org/W2077494771","https://openalex.org/W2097749765","https://openalex.org/W2110095567","https://openalex.org/W2126289105","https://openalex.org/W2149411572","https://openalex.org/W2172028426","https://openalex.org/W2184270665","https://openalex.org/W2232317135","https://openalex.org/W2284800790","https://openalex.org/W2286699414","https://openalex.org/W2290883490","https://openalex.org/W2546861836","https://openalex.org/W2581478358","https://openalex.org/W2798730128","https://openalex.org/W2911964244","https://openalex.org/W2912990735","https://openalex.org/W2964304707","https://openalex.org/W2972537746","https://openalex.org/W3025683731","https://openalex.org/W3027869639","https://openalex.org/W3095560934","https://openalex.org/W3125043549","https://openalex.org/W3135148245","https://openalex.org/W3193311858","https://openalex.org/W3193797593","https://openalex.org/W3216139160","https://openalex.org/W4296069280","https://openalex.org/W6685662889","https://openalex.org/W6777941750","https://openalex.org/W6804363299"],"related_works":["https://openalex.org/W2127461790","https://openalex.org/W2069324367","https://openalex.org/W2114688254","https://openalex.org/W2981428355","https://openalex.org/W1834994814","https://openalex.org/W2041273198","https://openalex.org/W1599055764","https://openalex.org/W2131711534","https://openalex.org/W2149163000","https://openalex.org/W2962858469"],"abstract_inverted_index":{"This":[0,58],"paper":[1],"introduces":[2],"a":[3,33,61,166],"paradigm":[4],"shift":[5],"regarding":[6],"vocal":[7,38],"learning":[8,20,30,39,62,134],"simulations,":[9],"in":[10,49,178],"which":[11],"the":[12,19,26,53,66,73,89,133,142,155,161,171,179],"communicative":[13],"function":[14,59],"of":[15,29,83,88,158,173],"speech":[16,54,69,115],"acquisition":[17,55],"determines":[18],"process":[21],"and":[22,91,120,129],"intelligibility":[23,157],"is":[24,40,111],"considered":[25],"primary":[27],"measure":[28],"success.":[31],"Thereby,":[32],"novel":[34],"approach":[35],"for":[36],"artificial":[37],"presented":[41],"that":[42,64,113,148,169],"utilizes":[43],"deep":[44],"neural":[45],"network-based":[46],"phoneme":[47],"recognition":[48],"order":[50],"to":[51],"calculate":[52],"objective":[56],"function.":[57],"guides":[60],"framework":[63,135],"involves":[65],"state-of-the-art":[67],"articulatory":[68,180],"synthesizer":[70],"VocalTractLab":[71],"as":[72,103,118,136,165],"motor-to-acoustic":[74],"forward":[75],"model.":[76],"In":[77],"this":[78,149],"way,":[79],"an":[80,137],"extensive":[81],"set":[82],"German":[84],"phonemes,":[85],"including":[86],"most":[87],"consonants":[90],"all":[92],"stressed":[93],"vowels,":[94],"was":[95,146],"produced":[96],"successfully.":[97],"The":[98],"synthetic":[99],"phonemes":[100],"were":[101],"rated":[102],"highly":[104],"intelligible":[105],"by":[106],"human":[107],"listeners.":[108],"Furthermore,":[109],"it":[110],"shown":[112],"visual":[114,150,162],"information,":[116],"such":[117],"lip":[119],"jaw":[121],"movements,":[122],"can":[123],"be":[124,130],"extracted":[125],"from":[126],"video":[127],"recordings":[128],"incorporated":[131],"into":[132],"additional":[138],"loss":[139,151,163],"component":[140],"during":[141],"optimization":[143],"process.":[144],"It":[145],"observed":[147],"did":[152],"not":[153],"increase":[154],"overall":[156],"phonemes.":[159],"Instead,":[160],"acted":[164],"regularization":[167],"mechanism":[168],"facilitated":[170],"finding":[172],"more":[174],"biologically":[175],"plausible":[176],"solutions":[177],"domain.":[181]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
