{"id":"https://openalex.org/W7133565758","doi":"https://doi.org/10.48550/arxiv.2603.03096","title":"Interpreting Speaker Characteristics in the Dimensions of Self-Supervised Speech Features","display_name":"Interpreting Speaker Characteristics in the Dimensions of Self-Supervised Speech Features","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133565758","doi":"https://doi.org/10.48550/arxiv.2603.03096"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.03096","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03096","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.03096","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128044684","display_name":"Kyle Janse van Rensburg","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"van Rensburg, Kyle Janse","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046382731","display_name":"Benjamin van Niekerk","orcid":"https://orcid.org/0000-0001-9207-6309"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"van Niekerk, Benjamin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5040305929","display_name":"Herman Kamper","orcid":"https://orcid.org/0000-0003-2980-3475"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kamper, Herman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5128044684"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9179999828338623,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9179999828338623,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.018200000748038292,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.011099999770522118,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dimension","display_name":"Dimension (graph theory)","score":0.6345000267028809},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5501000285148621},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5284000039100647},{"id":"https://openalex.org/keywords/principal-component-analysis","display_name":"Principal component analysis","score":0.48739999532699585},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.48249998688697815},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.427700012922287},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4092999994754791},{"id":"https://openalex.org/keywords/principal","display_name":"Principal (computer security)","score":0.38960000872612}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6880000233650208},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.6345000267028809},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6136000156402588},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5501000285148621},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5284000039100647},{"id":"https://openalex.org/C27438332","wikidata":"https://www.wikidata.org/wiki/Q2873","display_name":"Principal component analysis","level":2,"score":0.48739999532699585},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.48249998688697815},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4357999861240387},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.427700012922287},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4092999994754791},{"id":"https://openalex.org/C144559511","wikidata":"https://www.wikidata.org/wiki/Q2986279","display_name":"Principal (computer security)","level":2,"score":0.38960000872612},{"id":"https://openalex.org/C70518039","wikidata":"https://www.wikidata.org/wiki/Q16000077","display_name":"Dimensionality reduction","level":2,"score":0.36719998717308044},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.358599990606308},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.34150001406669617},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.33000001311302185},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.32919999957084656},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.310699999332428},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25839999318122864},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.25459998846054077},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.03096","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03096","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.03096","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03096","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6162038445472717,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"How":[0],"do":[1],"speech":[2,32],"models":[3],"trained":[4],"through":[5],"self-supervised":[6],"learning":[7],"structure":[8],"their":[9],"representations?":[10],"Previous":[11],"studies":[12,28],"have":[13,29],"looked":[14],"at":[15,48],"how":[16],"information":[17,50],"is":[18],"encoded":[19],"in":[20,92,120],"feature":[21],"vectors":[22],"across":[23],"different":[24],"layers.":[25],"But":[26],"few":[27],"considered":[30],"whether":[31],"characteristics":[33,72,99,115],"are":[34],"captured":[35],"within":[36],"individual":[37,76],"dimensions":[38,78],"of":[39,116],"SSL":[40],"features.":[41],"In":[42],"this":[43],"paper":[44],"we":[45,58,95],"specifically":[46],"look":[47],"speaker":[49],"using":[51],"PCA":[52],"on":[53],"utterance-averaged":[54],"representations.":[55],"Using":[56],"WavLM,":[57],"find":[59],"that":[60,64,97],"the":[61,84,105,117],"principal":[62,77],"dimension":[63],"explains":[65],"most":[66,98],"variance":[67],"encodes":[68],"pitch":[69],"and":[70,87],"associated":[71],"like":[73],"gender.":[74],"Other":[75],"correlate":[79],"with":[80],"intensity,":[81],"noise":[82],"levels,":[83],"second":[85],"formant,":[86],"higher":[88],"frequency":[89],"characteristics.":[90],"Finally,":[91],"synthesis":[93,121],"experiments":[94],"show":[96],"can":[100],"be":[101],"controlled":[102],"by":[103],"changing":[104],"corresponding":[106],"dimensions.":[107],"This":[108],"provides":[109],"a":[110],"simple":[111],"method":[112],"to":[113],"control":[114],"output":[118],"voice":[119],"applications.":[122]},"counts_by_year":[],"updated_date":"2026-05-03T08:25:01.440150","created_date":"2026-03-05T00:00:00"}
