{"id":"https://openalex.org/W4416756381","doi":"https://doi.org/10.1109/sped67700.2025.11252189","title":"Pretrained Speech Models Learn Boundaries, Not Patterns: An Analysis of Supervised vs. Unsupervised Capabilities","display_name":"Pretrained Speech Models Learn Boundaries, Not Patterns: An Analysis of Supervised vs. Unsupervised Capabilities","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416756381","doi":"https://doi.org/10.1109/sped67700.2025.11252189"},"language":null,"primary_location":{"id":"doi:10.1109/sped67700.2025.11252189","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11252189","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024805800","display_name":"Abdul Rehman","orcid":"https://orcid.org/0000-0002-9343-7652"},"institutions":[{"id":"https://openalex.org/I9300472","display_name":"Bournemouth University","ror":"https://ror.org/05wwcw481","country_code":"GB","type":"education","lineage":["https://openalex.org/I9300472"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Abdul Rehman","raw_affiliation_strings":["Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom"],"affiliations":[{"raw_affiliation_string":"Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom","institution_ids":["https://openalex.org/I9300472"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037813153","display_name":"Kavisha Jayathunge","orcid":null},"institutions":[{"id":"https://openalex.org/I9300472","display_name":"Bournemouth University","ror":"https://ror.org/05wwcw481","country_code":"GB","type":"education","lineage":["https://openalex.org/I9300472"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Kavisha Jayathunge","raw_affiliation_strings":["Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom"],"affiliations":[{"raw_affiliation_string":"Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom","institution_ids":["https://openalex.org/I9300472"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100358176","display_name":"Jianjun Zhang","orcid":"https://orcid.org/0000-0001-7172-0855"},"institutions":[{"id":"https://openalex.org/I9300472","display_name":"Bournemouth University","ror":"https://ror.org/05wwcw481","country_code":"GB","type":"education","lineage":["https://openalex.org/I9300472"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Jian-Jun Zhang","raw_affiliation_strings":["Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom"],"affiliations":[{"raw_affiliation_string":"Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom","institution_ids":["https://openalex.org/I9300472"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049130877","display_name":"Xiao\u2010Song Yang","orcid":"https://orcid.org/0000-0002-4789-8335"},"institutions":[{"id":"https://openalex.org/I9300472","display_name":"Bournemouth University","ror":"https://ror.org/05wwcw481","country_code":"GB","type":"education","lineage":["https://openalex.org/I9300472"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xiaosong Yang","raw_affiliation_strings":["Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom"],"affiliations":[{"raw_affiliation_string":"Bournemouth University,Faculty of Media, Science and Technology,Bournemouth,United Kingdom","institution_ids":["https://openalex.org/I9300472"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5024805800"],"corresponding_institution_ids":["https://openalex.org/I9300472"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.20586938,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"114","last_page":"119"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.28940001130104065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.28940001130104065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.16040000319480896,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.09109999984502792,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/orthogonality","display_name":"Orthogonality","score":0.7150999903678894},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.6919000148773193},{"id":"https://openalex.org/keywords/stress","display_name":"Stress (linguistics)","score":0.6636000275611877},{"id":"https://openalex.org/keywords/value","display_name":"Value (mathematics)","score":0.34790000319480896},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3449999988079071},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.3440999984741211},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.33390000462532043},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.32359999418258667}],"concepts":[{"id":"https://openalex.org/C17137986","wikidata":"https://www.wikidata.org/wiki/Q215067","display_name":"Orthogonality","level":2,"score":0.7150999903678894},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.6919000148773193},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6800000071525574},{"id":"https://openalex.org/C2776756274","wikidata":"https://www.wikidata.org/wiki/Q181767","display_name":"Stress (linguistics)","level":2,"score":0.6636000275611877},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5575000047683716},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5307999849319458},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3871999979019165},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3637999892234802},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.34790000319480896},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3449999988079071},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.3440999984741211},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.33390000462532043},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.32359999418258667},{"id":"https://openalex.org/C84525736","wikidata":"https://www.wikidata.org/wiki/Q831366","display_name":"Decision tree","level":2,"score":0.3206000030040741},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C148043351","wikidata":"https://www.wikidata.org/wiki/Q4456944","display_name":"Current (fluid)","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.2669999897480011},{"id":"https://openalex.org/C92835128","wikidata":"https://www.wikidata.org/wiki/Q1277447","display_name":"Hierarchical clustering","level":3,"score":0.2637999951839447},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.25189998745918274},{"id":"https://openalex.org/C137584468","wikidata":"https://www.wikidata.org/wiki/Q35395","display_name":"Phonetics","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/sped67700.2025.11252189","is_oa":false,"landing_page_url":"https://doi.org/10.1109/sped67700.2025.11252189","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Conference on Speech Technology and Human-Computer Dialogue (SpeD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W2742542661","https://openalex.org/W3197580070","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4392904752","https://openalex.org/W4394773771","https://openalex.org/W4394862910","https://openalex.org/W4401281176"],"related_works":[],"abstract_inverted_index":{"Do":[0],"pretrained":[1],"speech":[2,6,24,34,112],"models":[3,25,31,50,113],"genuinely":[4],"understand":[5],"patterns,":[7],"or":[8],"do":[9,157],"they":[10,70],"simply":[11],"learn":[12,114],"to":[13,56,77,124],"classify?":[14],"We":[15,29],"investigate":[16],"this":[17],"fundamental":[18],"question":[19],"by":[20],"analyzing":[21],"10":[22],"state-of-the-art":[23],"across":[26],"diverse":[27],"tasks.":[28],"tested":[30],"on":[32],"six":[33],"characteristics":[35],"(gender,":[36],"accent,":[37],"age,":[38],"emotion,":[39],"words,":[40],"speaker":[41],"identity)":[42],"using":[43],"both":[44],"classification":[45,53,96,155],"and":[46],"clustering":[47,73,105],"approaches.":[48],"While":[49],"achieve":[51],"impressive":[52],"accuracy":[54],"(up":[55],"$\\mathbf{1":[57],"0":[58,59],"\\%}$":[60,67],"for":[61,68],"gender,":[62],"$\\mathbf{9":[63],"4":[64],".":[65],"9":[66],"words),":[69],"show":[71],"poor":[72],"performance":[74],"when":[75],"attempting":[76],"discover":[78],"identical":[79],"patterns":[80,103],"without":[81],"supervision.":[82],"Most":[83],"critically,":[84],"we":[85],"find":[86],"systematic":[87],"negative":[88],"correlations":[89],"between":[90],"these":[91],"capabilities\u2014models":[92],"better":[93],"at":[94,100],"accent":[95,102],"are":[97],"actually":[98],"worse":[99],"discovering":[101],"through":[104],"($\\mathrm{r}=-0.904$).":[106],"Our":[107],"analysis":[108],"suggests":[109],"that":[110,164],"current":[111,146],"equilateral":[115],"decision":[116],"boundaries":[117],"rather":[118],"than":[119],"orthogonal":[120],"pattern":[121,161],"representations,":[122],"due":[123],"training":[125],"objectives":[126],"promoting":[127],"uniform":[128],"dimensional":[129],"alignment,":[130],"lacking":[131],"orthogonality":[132],"in":[133,143],"multiple":[134],"dimensions.":[135],"These":[136],"findings":[137],"expose":[138],"a":[139],"critical":[140],"evaluation":[141],"crisis":[142],"self-supervised":[144],"learning:":[145],"benchmarks":[147],"may":[148],"systematically":[149],"overestimate":[150],"model":[151],"capabilities,":[152],"as":[153],"high":[154],"scores":[156],"not":[158],"guarantee":[159],"the":[160],"discovery":[162],"capabilities":[163],"form":[165],"SSL\u2019s":[166],"core":[167],"value":[168],"proposition.":[169]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-27T00:00:00"}
