{"id":"https://openalex.org/W4372348980","doi":"https://doi.org/10.1109/icassp49357.2023.10094711","title":"Evidence of Vocal Tract Articulation in Self-Supervised Learning of Speech","display_name":"Evidence of Vocal Tract Articulation in Self-Supervised Learning of Speech","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372348980","doi":"https://doi.org/10.1109/icassp49357.2023.10094711"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10094711","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10094711","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015390421","display_name":"Cheol Jun Cho","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]},{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Cheol Jun Cho","raw_affiliation_strings":["UC Berkeley,EECS,CA","EECS, UC Berkeley, CA"],"affiliations":[{"raw_affiliation_string":"UC Berkeley,EECS,CA","institution_ids":["https://openalex.org/I134446601"]},{"raw_affiliation_string":"EECS, UC Berkeley, CA","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081837203","display_name":"Peter Wu","orcid":"https://orcid.org/0000-0001-6503-3936"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Peter Wu","raw_affiliation_strings":["UC Berkeley,EECS,CA","EECS, UC Berkeley, CA"],"affiliations":[{"raw_affiliation_string":"UC Berkeley,EECS,CA","institution_ids":["https://openalex.org/I134446601"]},{"raw_affiliation_string":"EECS, UC Berkeley, CA","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103742478","display_name":"Abdelrahman Mohamed","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdelrahman Mohamed","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068922218","display_name":"Gopala K. Anumanchipalli","orcid":"https://orcid.org/0000-0002-9714-7740"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]},{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gopala K. Anumanchipalli","raw_affiliation_strings":["UC Berkeley,EECS,CA","EECS, UC Berkeley, CA"],"affiliations":[{"raw_affiliation_string":"UC Berkeley,EECS,CA","institution_ids":["https://openalex.org/I134446601"]},{"raw_affiliation_string":"EECS, UC Berkeley, CA","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5015390421"],"corresponding_institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":2.5579,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.91364335,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/vocal-tract","display_name":"Vocal tract","score":0.8984866142272949},{"id":"https://openalex.org/keywords/articulation","display_name":"Articulation (sociology)","score":0.8265004754066467},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6731268167495728},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6081789135932922},{"id":"https://openalex.org/keywords/manner-of-articulation","display_name":"Manner of articulation","score":0.4685080349445343},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4480052590370178},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.391070693731308}],"concepts":[{"id":"https://openalex.org/C47401133","wikidata":"https://www.wikidata.org/wiki/Q748953","display_name":"Vocal tract","level":2,"score":0.8984866142272949},{"id":"https://openalex.org/C2779337067","wikidata":"https://www.wikidata.org/wiki/Q4800961","display_name":"Articulation (sociology)","level":3,"score":0.8265004754066467},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6731268167495728},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6081789135932922},{"id":"https://openalex.org/C66029223","wikidata":"https://www.wikidata.org/wiki/Q210847","display_name":"Manner of articulation","level":2,"score":0.4685080349445343},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4480052590370178},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.391070693731308},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10094711","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10094711","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6899999976158142,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2231075402","https://openalex.org/W2804300206","https://openalex.org/W2973049979","https://openalex.org/W2979476256","https://openalex.org/W2995181338","https://openalex.org/W3036601975","https://openalex.org/W3041561163","https://openalex.org/W3112034174","https://openalex.org/W3118578889","https://openalex.org/W3121914243","https://openalex.org/W3163596720","https://openalex.org/W3197580070","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4221145109","https://openalex.org/W4225096077","https://openalex.org/W4226380987","https://openalex.org/W4281492411","https://openalex.org/W4287173589","https://openalex.org/W4297841848","https://openalex.org/W6769196770","https://openalex.org/W6780218876","https://openalex.org/W6784050962","https://openalex.org/W6786669483","https://openalex.org/W6788328058","https://openalex.org/W6795952400","https://openalex.org/W6810007534"],"related_works":["https://openalex.org/W2022221551","https://openalex.org/W2067459736","https://openalex.org/W2105635394","https://openalex.org/W1552694876","https://openalex.org/W1037709911","https://openalex.org/W2063579271","https://openalex.org/W2891508361","https://openalex.org/W2351908579","https://openalex.org/W2748448327","https://openalex.org/W2066914524"],"abstract_inverted_index":{"Recent":[0],"self-supervised":[1],"learning":[2],"(SSL)":[3],"models":[4,34,123,156,187],"have":[5,28],"proven":[6],"to":[7,35,82,86,115,172,189],"learn":[8,188],"rich":[9],"representations":[10,85,150],"of":[11,50,112,121,128],"speech,":[12],"which":[13,37],"can":[14],"readily":[15],"be":[16],"utilized":[17],"by":[18,64,90],"diverse":[19],"downstream":[20],"tasks.":[21],"To":[22,73],"understand":[23],"such":[24],"utilities,":[25],"various":[26],"analyses":[27,52,137],"been":[29],"done":[30],"for":[31],"speech":[32,65,84,154,201],"SSL":[33,122,155,186],"reveal":[36],"and":[38,58,133,146,166,195],"how":[39],"information":[40],"is":[41,53,96],"encoded":[42],"in":[43,55],"the":[44,48,61,126,129,152],"learned":[45],"representations.":[46],"Although":[47],"scope":[49],"previous":[51],"extensive":[54],"acoustic,":[56],"phonetic,":[57],"semantic":[59],"perspectives,":[60],"physical":[62],"grounding":[63],"production":[66],"has":[67],"not":[68],"yet":[69],"received":[70],"full":[71],"attention.":[72],"bridge":[74],"this":[75],"gap,":[76],"we":[77,104],"conduct":[78],"a":[79,99,119,174,197],"comprehensive":[80],"analysis":[81,95],"link":[83],"articulatory":[87,106],"trajectories":[88],"measured":[89],"electromagnetic":[91],"articulography":[92],"(EMA).":[93],"Our":[94,182],"based":[97],"on":[98,138],"linear":[100,113,175],"probing":[101],"approach":[102],"where":[103],"measure":[105],"score":[107],"as":[108],"an":[109],"average":[110],"correlation":[111],"mapping":[114],"EMA.":[116],"We":[117],"analyze":[118],"set":[120],"selected":[124],"from":[125,151],"leaderboard":[127],"SUPERB":[130],"benchmark":[131],"[1]":[132],"perform":[134],"further":[135],"layer-wise":[136],"two":[139],"most":[140],"successful":[141],"models,":[142],"Wav2Vec":[143],"2.0":[144],"[2]":[145],"HuBERT":[147],"[3].":[148],"Surprisingly,":[149],"recent":[153],"are":[157,170],"highly":[158],"correlated":[159],"with":[160,177,192],"EMA":[161],"traces":[162],"(best:":[163],"r":[164],"=0.81),":[165],"only":[167],"5":[168],"minutes":[169],"sufficient":[171],"train":[173],"model":[176],"high":[178],"performance":[179],"(r":[180],"=0.77).":[181],"findings":[183],"suggest":[184],"that":[185],"align":[190],"closely":[191],"continuous":[193],"articulations,":[194],"provide":[196],"novel":[198],"insight":[199],"into":[200],"SSL.":[202]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
