{"id":"https://openalex.org/W4392904409","doi":"https://doi.org/10.1109/icassp48485.2024.10446062","title":"SD-HuBERT: Sentence-Level Self-Distillation Induces Syllabic Organization in Hubert","display_name":"SD-HuBERT: Sentence-Level Self-Distillation Induces Syllabic Organization in Hubert","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392904409","doi":"https://doi.org/10.1109/icassp48485.2024.10446062"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015390421","display_name":"Cheol Jun Cho","orcid":null},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Cheol Jun Cho","raw_affiliation_strings":["UC Berkeley"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UC Berkeley","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103742478","display_name":"Abdelrahman Mohamed","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdelrahman Mohamed","raw_affiliation_strings":["Rembrand"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Rembrand","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029566548","display_name":"Shang-Wen Li","orcid":"https://orcid.org/0000-0003-0656-9874"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shang-Wen Li","raw_affiliation_strings":["Meta AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107337645","display_name":"Alan W. Black","orcid":"https://orcid.org/0000-0001-8820-8831"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alan W Black","raw_affiliation_strings":["Carnegie Mellon University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068922218","display_name":"Gopala K. Anumanchipalli","orcid":"https://orcid.org/0000-0002-9714-7740"},"institutions":[{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]},{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gopala K. Anumanchipalli","raw_affiliation_strings":["UC Berkeley"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UC Berkeley","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5015390421"],"corresponding_institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":1.3171,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.82374653,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"12076","last_page":"12080"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7835294008255005},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.7023731470108032},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.700995922088623},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6053874492645264},{"id":"https://openalex.org/keywords/syllable","display_name":"Syllable","score":0.5652811527252197},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5217053294181824},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5051010251045227},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.49852705001831055},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.49715736508369446},{"id":"https://openalex.org/keywords/syllabic-verse","display_name":"Syllabic verse","score":0.4569677412509918},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.43829184770584106}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7835294008255005},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.7023731470108032},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.700995922088623},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6053874492645264},{"id":"https://openalex.org/C109089402","wikidata":"https://www.wikidata.org/wiki/Q8188","display_name":"Syllable","level":2,"score":0.5652811527252197},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5217053294181824},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5051010251045227},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.49852705001831055},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.49715736508369446},{"id":"https://openalex.org/C194051139","wikidata":"https://www.wikidata.org/wiki/Q2629608","display_name":"Syllabic verse","level":2,"score":0.4569677412509918},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.43829184770584106},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10446062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5268838972","display_name":"Collaborative Research: RI: Medium: Flexible Deep Speech Synthesis through Gestural Modeling","funder_award_id":"2106928","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1778492285","https://openalex.org/W2114347655","https://openalex.org/W2250539671","https://openalex.org/W2462305634","https://openalex.org/W2747874407","https://openalex.org/W2908510526","https://openalex.org/W2936774411","https://openalex.org/W2963720603","https://openalex.org/W2963902314","https://openalex.org/W2964169922","https://openalex.org/W2995680346","https://openalex.org/W3036601975","https://openalex.org/W3095361818","https://openalex.org/W3140429000","https://openalex.org/W3156636935","https://openalex.org/W3159481202","https://openalex.org/W3197580070","https://openalex.org/W3198217962","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3213018012","https://openalex.org/W4224875474","https://openalex.org/W4281492411","https://openalex.org/W4313679638","https://openalex.org/W4372348980","https://openalex.org/W4375868953","https://openalex.org/W4381786045","https://openalex.org/W4385823003","https://openalex.org/W4385823277","https://openalex.org/W4394671563","https://openalex.org/W6757817989","https://openalex.org/W6770596778","https://openalex.org/W6780218876","https://openalex.org/W6790356757","https://openalex.org/W6803675045","https://openalex.org/W6848735303"],"related_works":["https://openalex.org/W2254467971","https://openalex.org/W2035036138","https://openalex.org/W3122402893","https://openalex.org/W2364360542","https://openalex.org/W2385848593","https://openalex.org/W57477431","https://openalex.org/W2365880448","https://openalex.org/W206077614","https://openalex.org/W2012074517","https://openalex.org/W2621806188"],"abstract_inverted_index":{"Data-driven":[0],"unit":[1],"discovery":[2,131],"in":[3,25,44,79,127],"self-supervised":[4],"learning":[5,45,133],"(SSL)":[6],"of":[7,15,48,117,142],"speech":[8,29],"has":[9],"embarked":[10],"on":[11,151],"a":[12,40,106],"new":[13,107],"era":[14],"spoken":[16,163],"language":[17,164],"processing.":[18],"Yet,":[19],"the":[20,58,67,73,82,99,140],"discovered":[21],"units":[22,30,161],"often":[23],"remain":[24],"phonetic":[26],"space":[27],"and":[28,81,132,156],"beyond":[31],"phonemes":[32],"are":[33],"largely":[34,96],"underexplored.":[35],"Here,":[36],"we":[37,52,104,137],"demonstrate":[38,91,138],"that":[39,65,92,139],"syllabic":[41,88,147],"organization":[42,148],"emerges":[43],"sentence-level":[46,115,134],"representation":[47,116],"speech.":[49,118],"In":[50],"particular,":[51],"adopt":[53],"\"self-distillation\"":[54],"objective":[55],"to":[56,98,121,146],"fine-tune":[57],"pretrained":[59],"HuBERT":[60,143],"with":[61],"an":[62],"aggregator":[63],"token":[64],"summarizes":[66],"entire":[68],"sentence.":[69],"Without":[70],"any":[71],"supervision,":[72],"resulting":[74],"model":[75,125],"draws":[76],"definite":[77],"boundaries":[78],"speech,":[80],"representations":[83],"across":[84],"frames":[85],"exhibit":[86],"salient":[87],"structures.":[89],"We":[90],"this":[93],"emergent":[94],"structure":[95],"corresponds":[97],"ground":[100],"truth":[101],"syllables.":[102],"Furthermore,":[103],"propose":[105],"benchmark":[108],"task,":[109],"Spoken":[110],"Speech":[111],"ABX,":[112],"for":[113,162],"evaluating":[114],"When":[119],"compared":[120],"previous":[122],"models,":[123],"our":[124],"outperforms":[126],"both":[128],"unsupervised":[129],"syllable":[130],"representation.":[135],"Together,":[136],"self-distillation":[141],"gives":[144],"rise":[145],"without":[149],"relying":[150],"external":[152],"labels":[153],"or":[154],"modalities,":[155],"potentially":[157],"provides":[158],"novel":[159],"data-driven":[160],"modeling.":[165]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-28T14:05:53.105641","created_date":"2025-10-10T00:00:00"}
