{"id":"https://openalex.org/W4402112164","doi":"https://doi.org/10.21437/interspeech.2024-1095","title":"Self-Supervised Models for Phoneme Recognition: Applications in Children's Speech for Reading Learning","display_name":"Self-Supervised Models for Phoneme Recognition: Applications in Children's Speech for Reading Learning","publication_year":2024,"publication_date":"2024-09-01","ids":{"openalex":"https://openalex.org/W4402112164","doi":"https://doi.org/10.21437/interspeech.2024-1095"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2024-1095","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2024-1095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2024","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.04710","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5099469292","display_name":"Lucas Block Medin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lucas Block Medin","raw_affiliation_strings":["Lalilo [Paris] (236 Rue du Faubourg Saint-Martin, 75010 Paris - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Lalilo [Paris] (236 Rue du Faubourg Saint-Martin, 75010 Paris - France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088037163","display_name":"Thomas Pellegrini","orcid":"https://orcid.org/0000-0001-8984-1399"},"institutions":[{"id":"https://openalex.org/I3131550300","display_name":"Universit\u00e9 Toulouse-I-Capitole","ror":"https://ror.org/0443n9e75","country_code":"FR","type":"education","lineage":["https://openalex.org/I3131550300"]},{"id":"https://openalex.org/I4210160189","display_name":"Institut Polytechnique de Bordeaux","ror":"https://ror.org/054qv7y42","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210160189"]},{"id":"https://openalex.org/I4210152422","display_name":"Universit\u00e9 Toulouse - Jean Jaur\u00e8s","ror":"https://ror.org/04ezk3x31","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210152422","https://openalex.org/I4405258862"]},{"id":"https://openalex.org/I4210119061","display_name":"Institut de Recherche en Informatique de Toulouse","ror":"https://ror.org/01rx4qw44","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I205747304","https://openalex.org/I205747304","https://openalex.org/I4210119061","https://openalex.org/I4387153255","https://openalex.org/I4405258862","https://openalex.org/I4405258862"]},{"id":"https://openalex.org/I134560555","display_name":"Universit\u00e9 Toulouse III - Paul Sabatier","ror":"https://ror.org/02v6kpv12","country_code":"FR","type":"education","lineage":["https://openalex.org/I134560555","https://openalex.org/I4405258862"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Thomas Pellegrini","raw_affiliation_strings":["IRIT-SAMoVA - \u00c9quipe Structuration, Analyse et MOd\u00e9lisation de documents Vid\u00e9o et Audio (IRIT\r\n118 Route de Narbonne\r\n31062 Toulouse Cedex 9 - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IRIT-SAMoVA - \u00c9quipe Structuration, Analyse et MOd\u00e9lisation de documents Vid\u00e9o et Audio (IRIT\r\n118 Route de Narbonne\r\n31062 Toulouse Cedex 9 - France)","institution_ids":["https://openalex.org/I4210152422","https://openalex.org/I134560555","https://openalex.org/I4210119061","https://openalex.org/I3131550300","https://openalex.org/I4210160189"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064453226","display_name":"Lucile Gelin","orcid":"https://orcid.org/0000-0002-5623-9438"},"institutions":[{"id":"https://openalex.org/I4210119061","display_name":"Institut de Recherche en Informatique de Toulouse","ror":"https://ror.org/01rx4qw44","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I205747304","https://openalex.org/I205747304","https://openalex.org/I4210119061","https://openalex.org/I4387153255","https://openalex.org/I4405258862","https://openalex.org/I4405258862"]},{"id":"https://openalex.org/I3131550300","display_name":"Universit\u00e9 Toulouse-I-Capitole","ror":"https://ror.org/0443n9e75","country_code":"FR","type":"education","lineage":["https://openalex.org/I3131550300"]},{"id":"https://openalex.org/I4210160189","display_name":"Institut Polytechnique de Bordeaux","ror":"https://ror.org/054qv7y42","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210160189"]},{"id":"https://openalex.org/I134560555","display_name":"Universit\u00e9 Toulouse III - Paul Sabatier","ror":"https://ror.org/02v6kpv12","country_code":"FR","type":"education","lineage":["https://openalex.org/I134560555","https://openalex.org/I4405258862"]},{"id":"https://openalex.org/I4210152422","display_name":"Universit\u00e9 Toulouse - Jean Jaur\u00e8s","ror":"https://ror.org/04ezk3x31","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210152422","https://openalex.org/I4405258862"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Lucile Gelin","raw_affiliation_strings":["Lalilo [Paris] (236 Rue du Faubourg Saint-Martin, 75010 Paris - France)","IRIT-SAMoVA - \u00c9quipe Structuration, Analyse et MOd\u00e9lisation de documents Vid\u00e9o et Audio (IRIT\r\n118 Route de Narbonne\r\n31062 Toulouse Cedex 9 - France)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Lalilo [Paris] (236 Rue du Faubourg Saint-Martin, 75010 Paris - France)","institution_ids":[]},{"raw_affiliation_string":"IRIT-SAMoVA - \u00c9quipe Structuration, Analyse et MOd\u00e9lisation de documents Vid\u00e9o et Audio (IRIT\r\n118 Route de Narbonne\r\n31062 Toulouse Cedex 9 - France)","institution_ids":["https://openalex.org/I4210152422","https://openalex.org/I134560555","https://openalex.org/I4210119061","https://openalex.org/I3131550300","https://openalex.org/I4210160189"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5099469292"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.296,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.83508419,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"5168","last_page":"5172"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9861000180244446,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8001688718795776},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7296004891395569},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6814247965812683},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.513368546962738},{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.4606281816959381},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45705080032348633},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4146215319633484},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.38588711619377136},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07620564103126526},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.0757666528224945}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8001688718795776},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7296004891395569},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6814247965812683},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.513368546962738},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.4606281816959381},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45705080032348633},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4146215319633484},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38588711619377136},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07620564103126526},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0757666528224945},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.21437/interspeech.2024-1095","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2024-1095","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2024","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.04710","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.04710","pdf_url":"https://arxiv.org/pdf/2503.04710","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:HAL:hal-04694927v1","is_oa":true,"landing_page_url":"https://hal.science/hal-04694927","pdf_url":"https://hal.science/hal-04694927/document","source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"https://www.isca-archive.org/interspeech_2024/blockmedin24_interspeech.html","raw_type":"Conference papers"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.04710","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.04710","pdf_url":"https://arxiv.org/pdf/2503.04710","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.8399999737739563}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2082438799","https://openalex.org/W1966986837","https://openalex.org/W2360138227","https://openalex.org/W4365808155","https://openalex.org/W1838455177","https://openalex.org/W1489909378","https://openalex.org/W2392697679","https://openalex.org/W2385990477","https://openalex.org/W2331964906","https://openalex.org/W2067734110"],"abstract_inverted_index":{"Child":[0],"speech":[1,33,139],"recognition":[2,34,58],"is":[3,127],"still":[4],"an":[5],"underdeveloped":[6],"area":[7],"of":[8,14,24,70,111,119],"research":[9],"due":[10],"to":[11,56,130],"the":[12,21,68,109,116],"lack":[13],"data":[15],"(especially":[16],"on":[17,86],"non-English":[18],"languages)":[19],"and":[20,52,63,94,122,134],"specific":[22],"difficulties":[23],"this":[25,39],"task.":[26],"Having":[27],"explored":[28],"various":[29,131],"architectures":[30],"for":[31],"child":[32,61,87,141],"in":[35,38,59,107],"previous":[36],"work,":[37],"article":[40],"we":[41,105],"tackle":[42],"recent":[43],"self-supervised":[44,143],"models.":[45],"We":[46,74],"first":[47],"compare":[48],"wav2vec":[49],"2.0,":[50],"HuBERT":[51],"WavLM":[53,72,125],"models":[54,114],"adapted":[55],"phoneme":[57],"French":[60],"speech,":[62,88,142],"continue":[64],"our":[65,99,120],"experiments":[66],"with":[67],"best":[69],"them,":[71],"base+.":[73],"then":[75],"further":[76],"adapt":[77],"it":[78,96],"by":[79],"unfreezing":[80],"its":[81,92],"transformer":[82],"blocks":[83],"during":[84],"fine-tuning":[85],"which":[89],"greatly":[90],"improves":[91],"performance":[93],"makes":[95],"significantly":[97],"outperform":[98],"base":[100],"model,":[101],"a":[102],"Transformer+CTC.":[103],"Finally,":[104],"study":[106],"detail":[108],"behaviour":[110],"these":[112],"two":[113],"under":[115],"real":[117],"conditions":[118],"application,":[121],"show":[123],"that":[124],"base+":[126],"more":[128],"robust":[129],"reading":[132],"tasks":[133],"noise":[135],"levels.":[136],"Index":[137],"Terms:":[138],"recognition,":[140],"learning":[144]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3}],"updated_date":"2026-05-07T13:39:58.223016","created_date":"2025-10-10T00:00:00"}
