{"id":"https://openalex.org/W4408345597","doi":"https://doi.org/10.1109/icassp49660.2025.10889764","title":"Toward Visual Pronunciation Learning: A Speech-to-Articulatory Animation Pipeline Leveraging wav2vec 2.0 and rtMRI Landmarks","display_name":"Toward Visual Pronunciation Learning: A Speech-to-Articulatory Animation Pipeline Leveraging wav2vec 2.0 and rtMRI Landmarks","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408345597","doi":"https://doi.org/10.1109/icassp49660.2025.10889764"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10889764","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889764","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5116591555","display_name":"Mushaffa Rasyid Ridha","orcid":null},"institutions":[{"id":"https://openalex.org/I177738480","display_name":"Japan Advanced Institute of Science and Technology","ror":"https://ror.org/03frj4r98","country_code":"JP","type":"education","lineage":["https://openalex.org/I177738480"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Mushaffa Rasyid Ridha","raw_affiliation_strings":["Japan Adv. Inst. of Science &amp; Tech.,Grad. School of Adv. Science &amp; Tech.,Nomi,Japan"],"affiliations":[{"raw_affiliation_string":"Japan Adv. Inst. of Science &amp; Tech.,Grad. School of Adv. Science &amp; Tech.,Nomi,Japan","institution_ids":["https://openalex.org/I177738480"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014332130","display_name":"Shinobu Hasegawa","orcid":"https://orcid.org/0000-0002-0892-9629"},"institutions":[{"id":"https://openalex.org/I177738480","display_name":"Japan Advanced Institute of Science and Technology","ror":"https://ror.org/03frj4r98","country_code":"JP","type":"education","lineage":["https://openalex.org/I177738480"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shinobu Hasegawa","raw_affiliation_strings":["Japan Adv. Inst. of Science &amp; Tech.,Center of IDER,Nomi,Japan"],"affiliations":[{"raw_affiliation_string":"Japan Adv. Inst. of Science &amp; Tech.,Center of IDER,Nomi,Japan","institution_ids":["https://openalex.org/I177738480"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040108974","display_name":"Sakriani Sakti","orcid":"https://orcid.org/0000-0001-5509-8963"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Sakriani Sakti","raw_affiliation_strings":["Inst. of Science &amp; Tech.,Grad. School of Science &amp; Tech. Nara,Ikoma,Japan"],"affiliations":[{"raw_affiliation_string":"Inst. of Science &amp; Tech.,Grad. School of Science &amp; Tech. Nara,Ikoma,Japan","institution_ids":["https://openalex.org/I75917431"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5116591555"],"corresponding_institution_ids":["https://openalex.org/I177738480"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.05941532,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9761999845504761,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9672999978065491,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9646000266075134,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.814102053642273},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7928045392036438},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7195002436637878},{"id":"https://openalex.org/keywords/animation","display_name":"Animation","score":0.683122456073761},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6174966096878052},{"id":"https://openalex.org/keywords/coarticulation","display_name":"Coarticulation","score":0.4692442715167999},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4244886338710785},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.33728474378585815},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.15530377626419067},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.08979222178459167}],"concepts":[{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.814102053642273},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7928045392036438},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7195002436637878},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.683122456073761},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6174966096878052},{"id":"https://openalex.org/C130727458","wikidata":"https://www.wikidata.org/wiki/Q1639109","display_name":"Coarticulation","level":3,"score":0.4692442715167999},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4244886338710785},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33728474378585815},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.15530377626419067},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08979222178459167},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C2779581591","wikidata":"https://www.wikidata.org/wiki/Q36244","display_name":"Vowel","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10889764","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10889764","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6499999761581421,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1499800345","https://openalex.org/W1589867908","https://openalex.org/W1748856376","https://openalex.org/W2016538560","https://openalex.org/W2048449762","https://openalex.org/W2066381494","https://openalex.org/W2112890390","https://openalex.org/W2127141656","https://openalex.org/W2158203944","https://openalex.org/W2888815947","https://openalex.org/W2973049979","https://openalex.org/W3015213852","https://openalex.org/W3047225699","https://openalex.org/W3196525293","https://openalex.org/W3198275944","https://openalex.org/W3198439430","https://openalex.org/W3209059054","https://openalex.org/W4223651314","https://openalex.org/W4312069033","https://openalex.org/W4312617489","https://openalex.org/W4385823114","https://openalex.org/W4385987299","https://openalex.org/W4386857883","https://openalex.org/W4392902653","https://openalex.org/W4402112283","https://openalex.org/W4402112349","https://openalex.org/W6631190155","https://openalex.org/W6641416924","https://openalex.org/W6657871433","https://openalex.org/W6737202491","https://openalex.org/W6761970339","https://openalex.org/W6766978945","https://openalex.org/W6770514103","https://openalex.org/W6780218876"],"related_works":["https://openalex.org/W2990278625","https://openalex.org/W1993499045","https://openalex.org/W1604595447","https://openalex.org/W4200613756","https://openalex.org/W2983489261","https://openalex.org/W2988239832","https://openalex.org/W2075885261","https://openalex.org/W2174601236","https://openalex.org/W3036714870","https://openalex.org/W2172236910"],"abstract_inverted_index":{"Most":[0],"computer-assisted":[1],"pronunciation":[2,21,139],"training":[3],"(CAPT)":[4],"systems":[5,25],"for":[6,37,86],"second":[7],"language":[8],"(L2)":[9],"learners":[10],"focus":[11],"on":[12,16,56,112],"detecting":[13],"mispronunciation":[14],"based":[15,111],"predefined":[17],"phonemes":[18],"and":[19,73],"assigning":[20],"scores.":[22],"However,":[23],"these":[24],"often":[26],"lack":[27],"visual":[28,54,127,138],"feedback":[29,55],"or":[30],"detailed":[31,53],"corrective":[32],"guidance,":[33],"limiting":[34],"learners\u2019":[35],"opportunities":[36],"significant":[38,134],"improvement.":[39],"This":[40],"paper":[41],"presents":[42],"a":[43,48,80,84,133],"key":[44],"advance":[45],"toward":[46,136],"developing":[47],"CAPT":[49],"system":[50,124],"that":[51],"offers":[52],"articulatory":[57,65,74,105,128],"movements":[58,129],"using":[59],"real-time":[60],"magnetic":[61],"resonance":[62],"imaging":[63],"(rtMRI)":[64],"landmarks.":[66],"The":[67],"limited":[68],"availability":[69],"of":[70],"paired":[71],"speech":[72,90],"landmark":[75,114],"data,":[76],"typically":[77],"involving":[78],"only":[79],"few":[81],"speakers,":[82],"poses":[83],"challenge":[85],"generalizing":[87],"across":[88],"diverse":[89],"patterns.":[91],"To":[92],"address":[93],"this,":[94],"we":[95],"propose":[96],"leveraging":[97],"pretrained":[98],"wav2vec":[99],"2.0":[100],"embeddings,":[101],"fine-tuned":[102],"to":[103,108],"generate":[104],"contours":[106],"mapped":[107],"xy":[109],"coordinates":[110],"rtMRI":[113,120],"data.":[115],"As":[116],"evaluated":[117],"with":[118],"the":[119],"USC-TIMIT":[121],"dataset,":[122],"our":[123],"effectively":[125],"reconstructs":[126],"from":[130],"speech,":[131],"marking":[132],"step":[135],"enhanced":[137],"learning.":[140]},"counts_by_year":[],"updated_date":"2025-12-28T23:10:05.387466","created_date":"2025-10-10T00:00:00"}
