{"id":"https://openalex.org/W4392902939","doi":"https://doi.org/10.1109/icassp48485.2024.10447345","title":"Self-Supervised Models of Speech Infer Universal Articulatory Kinematics","display_name":"Self-Supervised Models of Speech Infer Universal Articulatory Kinematics","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392902939","doi":"https://doi.org/10.1109/icassp48485.2024.10447345"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447345","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10447345","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015390421","display_name":"Cheol Jun Cho","orcid":null},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Cheol Jun Cho","raw_affiliation_strings":["UC Berkeley"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UC Berkeley","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103742478","display_name":"Abdelrahman Mohamed","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdelrahman Mohamed","raw_affiliation_strings":["Rembrand"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Rembrand","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107337645","display_name":"Alan W. Black","orcid":"https://orcid.org/0000-0001-8820-8831"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alan W Black","raw_affiliation_strings":["Carnegie Mellon University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068922218","display_name":"Gopala K. Anumanchipalli","orcid":"https://orcid.org/0000-0002-9714-7740"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gopala K. Anumanchipalli","raw_affiliation_strings":["UC Berkeley"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UC Berkeley","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5015390421"],"corresponding_institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":0.9878,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.78280472,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":96},"biblio":{"volume":"33","issue":null,"first_page":"12061","last_page":"12065"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7713530659675598},{"id":"https://openalex.org/keywords/property","display_name":"Property (philosophy)","score":0.5636915564537048},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.5392023921012878},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.5178347229957581},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5117650032043457},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5036258101463318},{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.47314655780792236},{"id":"https://openalex.org/keywords/kinematics","display_name":"Kinematics","score":0.4684472382068634},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4658445119857788},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4538222551345825},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.4501414895057678},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4254961311817169},{"id":"https://openalex.org/keywords/inversion","display_name":"Inversion (geology)","score":0.4158073663711548},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.11043104529380798}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7713530659675598},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.5636915564537048},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.5392023921012878},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.5178347229957581},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5117650032043457},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5036258101463318},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.47314655780792236},{"id":"https://openalex.org/C39920418","wikidata":"https://www.wikidata.org/wiki/Q11476","display_name":"Kinematics","level":2,"score":0.4684472382068634},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4658445119857788},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4538222551345825},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.4501414895057678},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4254961311817169},{"id":"https://openalex.org/C1893757","wikidata":"https://www.wikidata.org/wiki/Q3653001","display_name":"Inversion (geology)","level":3,"score":0.4158073663711548},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.11043104529380798},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C109007969","wikidata":"https://www.wikidata.org/wiki/Q749565","display_name":"Structural basin","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447345","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10447345","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5268838972","display_name":"Collaborative Research: RI: Medium: Flexible Deep Speech Synthesis through Gestural Modeling","funder_award_id":"2106928","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2046681189","https://openalex.org/W2068447135","https://openalex.org/W2159693968","https://openalex.org/W2231075402","https://openalex.org/W2407043376","https://openalex.org/W2622158094","https://openalex.org/W2943899479","https://openalex.org/W3036601975","https://openalex.org/W3121914243","https://openalex.org/W3133449202","https://openalex.org/W3197580070","https://openalex.org/W3209059054","https://openalex.org/W3213029956","https://openalex.org/W4223430326","https://openalex.org/W4226380987","https://openalex.org/W4297841848","https://openalex.org/W4372340876","https://openalex.org/W4372348980","https://openalex.org/W4375868953","https://openalex.org/W4375869259","https://openalex.org/W4378105483","https://openalex.org/W4385823003","https://openalex.org/W4388034436","https://openalex.org/W6780218876","https://openalex.org/W6788328058","https://openalex.org/W6852909395"],"related_works":["https://openalex.org/W2029561777","https://openalex.org/W172797710","https://openalex.org/W3165080709","https://openalex.org/W2945105049","https://openalex.org/W2626699140","https://openalex.org/W2909357361","https://openalex.org/W4387098302","https://openalex.org/W4288365855","https://openalex.org/W2948317131","https://openalex.org/W2111961547"],"abstract_inverted_index":{"Self-Supervised":[0],"Learning":[1],"(SSL)":[2],"based":[3],"models":[4,19,30,63,143,159],"of":[5,14,41,49,55,61,88,129,141],"speech":[6,74,161,169],"have":[7,20,27],"shown":[8],"remarkable":[9],"performance":[10],"on":[11,138],"a":[12],"range":[13],"downstream":[15],"tasks.":[16],"These":[17],"state-of-the-art":[18],"remained":[21],"blackboxes,":[22],"but":[23],"many":[24],"recent":[25],"studies":[26],"begun":[28],"\u201cprobing\u201d":[29],"like":[31],"HuBERT,":[32],"to":[33,38,64,92,98,147],"correlate":[34],"their":[35,148],"internal":[36],"representations":[37],"different":[39],"aspects":[40],"speech.":[42],"In":[43],"this":[44,80,130],"paper,":[45],"we":[46,106],"show":[47,78,107],"\u201cinference":[48],"articulatory":[50,70],"kinematics\u201d":[51],"as":[52],"fundamental":[53],"property":[54],"SSL":[56,142],"models,":[57],"i.e.,":[58],"the":[59,68,73,86,89,94,99,127,139],"ability":[60],"these":[62,133],"transform":[65],"acoustics":[66],"into":[67,156],"causal":[69],"dynamics":[71],"underlying":[72],"signal.":[75],"We":[76],"also":[77],"that":[79,108,144,163],"abstraction":[81],"is":[82,116],"largely":[83],"overlapping":[84],"across":[85,118,121],"language":[87,100],"data":[90],"used":[91],"train":[93],"model,":[95],"with":[96,101,109],"preference":[97],"similar":[102],"phonological":[103],"system.":[104],"Furthermore,":[105],"simple":[110],"affine":[111],"transformations,":[112],"Acoustic-to-Articulatory":[113],"inversion":[114],"(AAI)":[115],"transferrable":[117],"speakers,":[119],"even":[120],"genders,":[122],"languages,":[123],"and":[124,151,166],"dialects,":[125],"showing":[126],"generalizability":[128],"property.":[131],"Together,":[132],"results":[134],"shed":[135],"new":[136,154],"light":[137],"internals":[140],"are":[145,164],"critical":[146],"superior":[149],"performance,":[150],"open":[152],"up":[153],"avenues":[155],"language-agnostic":[157],"universal":[158],"for":[160],"engineering,":[162],"interpretable":[165],"grounded":[167],"in":[168],"science.":[170]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-28T14:05:53.105641","created_date":"2025-10-10T00:00:00"}
