{"id":"https://openalex.org/W4401325931","doi":"https://doi.org/10.1109/lsp.2024.3438541","title":"Text-Guided HuBERT: Self-Supervised Speech Pre-Training via Generative Adversarial Networks","display_name":"Text-Guided HuBERT: Self-Supervised Speech Pre-Training via Generative Adversarial Networks","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4401325931","doi":"https://doi.org/10.1109/lsp.2024.3438541"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2024.3438541","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2024.3438541","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100526775","display_name":"Duo Ma","orcid":"https://orcid.org/0009-0000-5585-6621"},"institutions":[{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Duo Ma","raw_affiliation_strings":["Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0000-5585-6621","affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924","https://openalex.org/I4210099586"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060201191","display_name":"Xianghu Yue","orcid":"https://orcid.org/0000-0003-3527-6034"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xianghu Yue","raw_affiliation_strings":["Department of Electrical and Computer and Engineering, National University of Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0003-3527-6034","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer and Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077069016","display_name":"Junyi Ao","orcid":"https://orcid.org/0000-0001-8979-0835"},"institutions":[{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junyi Ao","raw_affiliation_strings":["Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0001-8979-0835","affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924","https://openalex.org/I4210099586"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101856962","display_name":"Xiaoxue Gao","orcid":"https://orcid.org/0000-0003-1920-5228"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xiaoxue Gao","raw_affiliation_strings":["Department of Electrical and Computer and Engineering, National University of Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0003-1920-5228","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer and Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032690182","display_name":"Haizhou Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haizhou Li","raw_affiliation_strings":["Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0001-9158-9401","affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924","https://openalex.org/I4210099586"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100526775"],"corresponding_institution_ids":["https://openalex.org/I4210099586","https://openalex.org/I4210116924"],"apc_list":null,"apc_paid":null,"fwci":0.3311,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.64407565,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"31","issue":null,"first_page":"2055","last_page":"2059"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9779000282287598,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9466000199317932,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.8231317400932312},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7615408897399902},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.565778374671936},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5548878908157349},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.552254855632782},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4835074245929718},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4189409017562866},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4188879728317261},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.39984071254730225}],"concepts":[{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.8231317400932312},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7615408897399902},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.565778374671936},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5548878908157349},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.552254855632782},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4835074245929718},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4189409017562866},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4188879728317261},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39984071254730225},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2024.3438541","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2024.3438541","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2799040894","display_name":null,"funder_award_id":"62271432","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2127141656","https://openalex.org/W2933138175","https://openalex.org/W2954930777","https://openalex.org/W2995181338","https://openalex.org/W3015356564","https://openalex.org/W3015734344","https://openalex.org/W3096831136","https://openalex.org/W3197227964","https://openalex.org/W3197580070","https://openalex.org/W3207222250","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4224934179","https://openalex.org/W4226120743","https://openalex.org/W4283324001","https://openalex.org/W4285821318","https://openalex.org/W4288089799","https://openalex.org/W4292976050","https://openalex.org/W4319862218","https://openalex.org/W4319862670","https://openalex.org/W4372260307","https://openalex.org/W4375869259","https://openalex.org/W4385573012","https://openalex.org/W4392979802","https://openalex.org/W6631190155","https://openalex.org/W6769627184","https://openalex.org/W6779216093","https://openalex.org/W6780218876","https://openalex.org/W6803092890","https://openalex.org/W6809739816"],"related_works":["https://openalex.org/W2502115930","https://openalex.org/W2482350142","https://openalex.org/W4246396837","https://openalex.org/W3126451824","https://openalex.org/W4394050964","https://openalex.org/W3211393740","https://openalex.org/W3208049411","https://openalex.org/W3022908591","https://openalex.org/W4285706568","https://openalex.org/W2551249631"],"abstract_inverted_index":{"Human":[0],"language":[1],"can":[2,16],"be":[3,105],"expressed":[4],"in":[5,127],"either":[6],"written":[7],"or":[8,13,75],"spoken":[9],"form,":[10],"i.e.":[11],"text":[12,20,36,126],"speech.":[14],"Humans":[15],"acquire":[17],"knowledge":[18],"from":[19,96,110],"to":[21,33,48,55,83,104,108,148],"improve":[22],"speaking":[23],"and":[24,60,125],"listening.":[25],"However,":[26],"the":[27,99,134,157],"quest":[28],"for":[29],"speech":[30,58,82,97,124],"pre-trained":[31],"models":[32],"leverage":[34],"unpaired":[35,112,123],"has":[37],"just":[38],"started.":[39],"In":[40,115],"this":[41,116],"letter,":[42],"we":[43,67,118],"investigate":[44],"a":[45,51,69,120,128],"new":[46],"way":[47],"pre-train":[49],"such":[50],"joint":[52],"speech-text":[53],"model":[54],"learn":[56],"enhanced":[57],"representations":[59],"benefit":[61],"various":[62,142],"speech-related":[63],"downstream":[64],"tasks.":[65],"Specifically,":[66],"propose":[68],"novel":[70],"pre-training":[71],"method,":[72],"text-guided":[73],"HuBERT,":[74],"T-HuBERT,":[76],"which":[77,145],"performs":[78],"self-supervised":[79],"learning":[80],"over":[81,141],"derive":[84],"phoneme-like":[85,90],"discrete":[86],"representations.":[87],"And":[88],"these":[89],"pseudo-label":[91],"sequences":[92],"are":[93],"firstly":[94],"derived":[95],"via":[98],"generative":[100],"adversarial":[101],"networks":[102],"(GAN)":[103],"statistically":[106],"similar":[107],"those":[109],"additional":[111],"textual":[113],"data.":[114],"way,":[117],"build":[119],"bridge":[121],"between":[122],"unsupervised":[129],"manner.":[130],"Extensive":[131],"experiments":[132],"demonstrate":[133],"significant":[135],"superiority":[136],"of":[137],"our":[138],"proposed":[139],"method":[140],"strong":[143],"baselines,":[144],"achieves":[146],"up":[147],"15.3%":[149],"relative":[150],"Word":[151],"Error":[152],"Rate":[153],"(WER)":[154],"reduction":[155],"on":[156],"LibriSpeech":[158],"dataset.":[159]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-26T23:08:49.675405","created_date":"2025-10-10T00:00:00"}
