{"id":"https://openalex.org/W3195797859","doi":"https://doi.org/10.21437/ssw.2021-10","title":"Adaptation of Tacotron2-based Text-To-Speech for Articulatory-to-Acoustic Mapping using Ultrasound Tongue Imaging","display_name":"Adaptation of Tacotron2-based Text-To-Speech for Articulatory-to-Acoustic Mapping using Ultrasound Tongue Imaging","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3195797859","doi":"https://doi.org/10.21437/ssw.2021-10","mag":"3195797859"},"language":"en","primary_location":{"id":"pmh:oai:publicatio.bibl.u-szeged.hu:25203","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306400436","display_name":"SZTE Publicatio Repozit\u00f3rium (University of Szeged)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I227486990","host_organization_name":"University of Szeged","host_organization_lineage":["https://openalex.org/I227486990"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"K\u00f6nyv r\u00e9sze"},"type":"article","indexed_in":[],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031237235","display_name":"Csaba Zaink\u00f3","orcid":null},"institutions":[{"id":"https://openalex.org/I29770179","display_name":"Budapest University of Technology and Economics","ror":"https://ror.org/02w42ss30","country_code":"HU","type":"education","lineage":["https://openalex.org/I29770179"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Zaink\u00f3 Csaba","raw_affiliation_strings":["Department of Telecommunications and Media Informatics, Budapest University of Technology and Economics, Budapest, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Telecommunications and Media Informatics, Budapest University of Technology and Economics, Budapest, Hungary","institution_ids":["https://openalex.org/I29770179"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020608163","display_name":"L\u00e1szl\u00f3 T\u00f3th","orcid":"https://orcid.org/0000-0003-0161-1375"},"institutions":[{"id":"https://openalex.org/I227486990","display_name":"University of Szeged","ror":"https://ror.org/01pnej532","country_code":"HU","type":"education","lineage":["https://openalex.org/I227486990"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"T\u00f3th L\u00e1szl\u00f3","raw_affiliation_strings":["Institute of Informatics, University of Szeged, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Informatics, University of Szeged, Hungary","institution_ids":["https://openalex.org/I227486990"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Honarmandi Shandiz Amin","orcid":null},"institutions":[{"id":"https://openalex.org/I227486990","display_name":"University of Szeged","ror":"https://ror.org/01pnej532","country_code":"HU","type":"education","lineage":["https://openalex.org/I227486990"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Honarmandi Shandiz Amin","raw_affiliation_strings":["Institute of Informatics, University of Szeged, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Informatics, University of Szeged, Hungary","institution_ids":["https://openalex.org/I227486990"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Gosztolya G\u00e1bor","orcid":null},"institutions":[{"id":"https://openalex.org/I4210152167","display_name":"MTA-SZTE Research Group on Artificial Intelligence","ror":"https://ror.org/0507fk326","country_code":"HU","type":"facility","lineage":["https://openalex.org/I227486990","https://openalex.org/I4210152167","https://openalex.org/I7597260"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Gosztolya G\u00e1bor","raw_affiliation_strings":["MTA-SZTE Research Group on Artificial Intelligence, Szeged, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"MTA-SZTE Research Group on Artificial Intelligence, Szeged, Hungary","institution_ids":["https://openalex.org/I4210152167"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085147054","display_name":"Alexandra Mark\u00f3","orcid":"https://orcid.org/0000-0003-0301-7134"},"institutions":[{"id":"https://openalex.org/I106118109","display_name":"E\u00f6tv\u00f6s Lor\u00e1nd University","ror":"https://ror.org/01jsq2704","country_code":"HU","type":"education","lineage":["https://openalex.org/I106118109"]},{"id":"https://openalex.org/I2802350943","display_name":"ELTE Research Centre for Linguistics","ror":"https://ror.org/005cqsz63","country_code":"HU","type":"facility","lineage":["https://openalex.org/I2802350943"]}],"countries":["HU"],"is_corresponding":true,"raw_author_name":"Mark\u00f3 Alexandra","raw_affiliation_strings":["Department of Applied Linguistics and Phonetics, E\u00f6tv\u00f6s Lor\u00e1nd University, Budapest, Hungary","MTA-ELTE Lend\u00fclet Lingual Articulation Research Group, Budapest, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Applied Linguistics and Phonetics, E\u00f6tv\u00f6s Lor\u00e1nd University, Budapest, Hungary","institution_ids":["https://openalex.org/I106118109"]},{"raw_affiliation_string":"MTA-ELTE Lend\u00fclet Lingual Articulation Research Group, Budapest, Hungary","institution_ids":["https://openalex.org/I2802350943","https://openalex.org/I106118109"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069988513","display_name":"G\u00e9za N\u00e9meth","orcid":"https://orcid.org/0000-0002-2311-4858"},"institutions":[{"id":"https://openalex.org/I29770179","display_name":"Budapest University of Technology and Economics","ror":"https://ror.org/02w42ss30","country_code":"HU","type":"education","lineage":["https://openalex.org/I29770179"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"N\u00e9meth G\u00e9za","raw_affiliation_strings":["Department of Telecommunications and Media Informatics, Budapest University of Technology and Economics, Budapest, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Telecommunications and Media Informatics, Budapest University of Technology and Economics, Budapest, Hungary","institution_ids":["https://openalex.org/I29770179"]}]},{"author_position":"last","author":{"id":null,"display_name":"Csap\u00f3 Tam\u00e1s G\u00e1bor","orcid":null},"institutions":[{"id":"https://openalex.org/I106118109","display_name":"E\u00f6tv\u00f6s Lor\u00e1nd University","ror":"https://ror.org/01jsq2704","country_code":"HU","type":"education","lineage":["https://openalex.org/I106118109"]},{"id":"https://openalex.org/I2802350943","display_name":"ELTE Research Centre for Linguistics","ror":"https://ror.org/005cqsz63","country_code":"HU","type":"facility","lineage":["https://openalex.org/I2802350943"]},{"id":"https://openalex.org/I29770179","display_name":"Budapest University of Technology and Economics","ror":"https://ror.org/02w42ss30","country_code":"HU","type":"education","lineage":["https://openalex.org/I29770179"]}],"countries":["HU"],"is_corresponding":false,"raw_author_name":"Csap\u00f3 Tam\u00e1s G\u00e1bor","raw_affiliation_strings":["MTA-ELTE Lend\u00fclet Lingual Articulation Research Group, Budapest, Hungary","Department of Telecommunications and Media Informatics, Budapest University of Technology and Economics, Budapest, Hungary"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"MTA-ELTE Lend\u00fclet Lingual Articulation Research Group, Budapest, Hungary","institution_ids":["https://openalex.org/I2802350943","https://openalex.org/I106118109"]},{"raw_affiliation_string":"Department of Telecommunications and Media Informatics, Budapest University of Technology and Economics, Budapest, Hungary","institution_ids":["https://openalex.org/I29770179"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5085147054"],"corresponding_institution_ids":["https://openalex.org/I106118109","https://openalex.org/I2802350943"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9878000020980835,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6890420913696289},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.6807650327682495},{"id":"https://openalex.org/keywords/tongue","display_name":"Tongue","score":0.6355090141296387},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5934143662452698},{"id":"https://openalex.org/keywords/ultrasound-imaging","display_name":"Ultrasound imaging","score":0.4452715814113617},{"id":"https://openalex.org/keywords/speech-production","display_name":"Speech production","score":0.44026267528533936},{"id":"https://openalex.org/keywords/ultrasound","display_name":"Ultrasound","score":0.42444950342178345},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.42200466990470886},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.0821075439453125},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.06932705640792847}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6890420913696289},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.6807650327682495},{"id":"https://openalex.org/C2779744641","wikidata":"https://www.wikidata.org/wiki/Q9614","display_name":"Tongue","level":2,"score":0.6355090141296387},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5934143662452698},{"id":"https://openalex.org/C2986892559","wikidata":"https://www.wikidata.org/wiki/Q234904","display_name":"Ultrasound imaging","level":3,"score":0.4452715814113617},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.44026267528533936},{"id":"https://openalex.org/C143753070","wikidata":"https://www.wikidata.org/wiki/Q162564","display_name":"Ultrasound","level":2,"score":0.42444950342178345},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.42200466990470886},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0821075439453125},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.06932705640792847},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"pmh:oai:publicatio.bibl.u-szeged.hu:25203","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306400436","display_name":"SZTE Publicatio Repozit\u00f3rium (University of Szeged)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I227486990","host_organization_name":"University of Szeged","host_organization_lineage":["https://openalex.org/I227486990"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"K\u00f6nyv r\u00e9sze"}],"best_oa_location":{"id":"pmh:oai:publicatio.bibl.u-szeged.hu:25203","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306400436","display_name":"SZTE Publicatio Repozit\u00f3rium (University of Szeged)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I227486990","host_organization_name":"University of Szeged","host_organization_lineage":["https://openalex.org/I227486990"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"K\u00f6nyv r\u00e9sze"},"sustainable_development_goals":[{"score":0.7099999785423279,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1983364832","https://openalex.org/W1995735739","https://openalex.org/W2008120082","https://openalex.org/W2145442746","https://openalex.org/W2163605009","https://openalex.org/W2294901616","https://openalex.org/W2296650368","https://openalex.org/W2419247625","https://openalex.org/W2482033662","https://openalex.org/W2515755543","https://openalex.org/W2519091744","https://openalex.org/W2546134918","https://openalex.org/W2585227925","https://openalex.org/W2585824449","https://openalex.org/W2603597171","https://openalex.org/W2746109435","https://openalex.org/W2768153200","https://openalex.org/W2770785043","https://openalex.org/W2888796252","https://openalex.org/W2888815552","https://openalex.org/W2888917702","https://openalex.org/W2889853672","https://openalex.org/W2952436057","https://openalex.org/W2955236751","https://openalex.org/W2963155035","https://openalex.org/W2963300588","https://openalex.org/W2964243274","https://openalex.org/W2972959620","https://openalex.org/W2978506536","https://openalex.org/W3013952251","https://openalex.org/W3083438180","https://openalex.org/W3086311074","https://openalex.org/W3094463005","https://openalex.org/W3095801312","https://openalex.org/W3135248196","https://openalex.org/W3143787022","https://openalex.org/W3157840621","https://openalex.org/W3158448163","https://openalex.org/W3173314831","https://openalex.org/W3174954176","https://openalex.org/W3217314539","https://openalex.org/W4308506949"],"related_works":["https://openalex.org/W4250133764","https://openalex.org/W2391770021","https://openalex.org/W2613177006","https://openalex.org/W4390298813","https://openalex.org/W2990800024","https://openalex.org/W4247495643","https://openalex.org/W3032126157","https://openalex.org/W2182698094","https://openalex.org/W4240101417","https://openalex.org/W2370916954"],"abstract_inverted_index":{"For":[0],"articulatory-to-acoustic":[1,43,62],"mapping,":[2],"typically":[3],"only":[4],"limited":[5,47],"parallel":[6],"training":[7],"data":[8,120],"is":[9,106,133,171],"available,":[10],"making":[11],"it":[12],"impossible":[13],"to":[14,35,97],"apply":[15],"fully":[16],"end-to-end":[17],"solutions":[18,177],"like":[19],"Tacotron2.In":[20],"this":[21,94],"paper,":[22],"we":[23,164],"experimented":[24],"with":[25,45,174,179],"transfer":[26],"learning":[27],"and":[28,56,101,129],"adaptation":[29],"of":[30,41,71,84,116,143],"a":[31,46,50,57,69,76],"Tacotron2":[32,53,87,91,137,160],"text-to-speech":[33],"model":[34,55,92,105],"improve":[36],"the":[37,82,85,90,103,114,117,122,126,130,136,144,150,158,167,175],"final":[38,109],"synthesis":[39],"quality":[40,170],"ultrasound-based":[42],"mapping":[44],"database.We":[48],"use":[49],"multi-speaker":[51],"pre-trained":[52,58,86],"TTS":[54],"WaveGlow":[59,104],"neural":[60,79],"vocoder.The":[61],"conversion":[63],"contains":[64,113],"three":[65],"steps:":[66],"1)":[67],"from":[68,121,157],"sequence":[70],"ultrasound":[72,123,146],"tongue":[73],"image":[74],"recordings,":[75],"3D":[77],"convolutional":[78],"network":[80],"predicts":[81],"inputs":[83],"model,":[88],"2)":[89],"converts":[93],"intermediate":[95],"representation":[96],"an":[98],"80-dimensional":[99],"mel-spectrogram,":[100],"3)":[102],"applied":[107],"for":[108],"inference.This":[110],"generated":[111],"speech":[112,169],"timing":[115],"original":[118,145],"articulatory":[119],"recording,":[124],"but":[125,148],"F0":[127,139],"contour":[128],"spectral":[131],"information":[132],"predicted":[134],"by":[135],"model.The":[138],"values":[140],"are":[141,155],"independent":[142],"images,":[147],"represent":[149],"target":[151],"speaker,":[152],"as":[153],"they":[154],"inferred":[156],"pretrained":[159],"model.In":[161],"our":[162,180],"experiments,":[163],"demonstrated":[165],"that":[166],"synthesized":[168],"more":[172],"natural":[173],"proposed":[176],"than":[178],"earlier":[181],"model.":[182]},"counts_by_year":[{"year":2024,"cited_by_count":3}],"updated_date":"2026-04-28T14:05:53.105641","created_date":"2021-08-30T00:00:00"}
