{"id":"https://openalex.org/W4372260307","doi":"https://doi.org/10.1109/icassp49357.2023.10096923","title":"Token2vec: A Joint Self-Supervised Pre-Training Framework Using Unpaired Speech and Text","display_name":"Token2vec: A Joint Self-Supervised Pre-Training Framework Using Unpaired Speech and Text","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372260307","doi":"https://doi.org/10.1109/icassp49357.2023.10096923"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096923","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp49357.2023.10096923","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060201191","display_name":"Xianghu Yue","orcid":"https://orcid.org/0000-0003-3527-6034"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Xianghu Yue","raw_affiliation_strings":["National University of Singapore,Department of Electrical and Computer Engineering,Singapore","Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Department of Electrical and Computer Engineering,Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077069016","display_name":"Junyi Ao","orcid":"https://orcid.org/0000-0001-8979-0835"},"institutions":[{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junyi Ao","raw_affiliation_strings":["Shenzhen Research Institute of Big Data,Shenzhen,China","Shenzhen Research Institute of Big Data, Shenzhen, China","School of Data Science, The Chinese University of Hong Kong, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Research Institute of Big Data,Shenzhen,China","institution_ids":["https://openalex.org/I4210099586"]},{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, Shenzhen, China","institution_ids":["https://openalex.org/I4210099586"]},{"raw_affiliation_string":"School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101856962","display_name":"Xiaoxue Gao","orcid":"https://orcid.org/0000-0003-1920-5228"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Xiaoxue Gao","raw_affiliation_strings":["National University of Singapore,Department of Electrical and Computer Engineering,Singapore","Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Department of Electrical and Computer Engineering,Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032690182","display_name":"Haizhou Li","orcid":"https://orcid.org/0000-0001-9158-9401"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]},{"id":"https://openalex.org/I4210099586","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210099586"]},{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN","SG"],"is_corresponding":false,"raw_author_name":"Haizhou Li","raw_affiliation_strings":["National University of Singapore,Department of Electrical and Computer Engineering,Singapore","School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","Shenzhen Research Institute of Big Data, Shenzhen, China","Department of Electrical and Computer Engineering, National University of Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore,Department of Electrical and Computer Engineering,Singapore","institution_ids":["https://openalex.org/I165932596"]},{"raw_affiliation_string":"School of Data Science, The Chinese University of Hong Kong, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924"]},{"raw_affiliation_string":"Shenzhen Research Institute of Big Data, Shenzhen, China","institution_ids":["https://openalex.org/I4210099586"]},{"raw_affiliation_string":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5060201191"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":1.2148,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.82773075,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8100882172584534},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7020933628082275},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6857497096061707},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5680463314056396},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5488312244415283},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5396866798400879},{"id":"https://openalex.org/keywords/speech-analytics","display_name":"Speech analytics","score":0.5257859230041504},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.48201218247413635},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4435039162635803},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4214388132095337},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.36424607038497925},{"id":"https://openalex.org/keywords/speech-corpus","display_name":"Speech corpus","score":0.3553149104118347}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8100882172584534},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7020933628082275},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6857497096061707},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5680463314056396},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5488312244415283},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5396866798400879},{"id":"https://openalex.org/C54953205","wikidata":"https://www.wikidata.org/wiki/Q4142201","display_name":"Speech analytics","level":4,"score":0.5257859230041504},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.48201218247413635},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4435039162635803},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4214388132095337},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.36424607038497925},{"id":"https://openalex.org/C91863865","wikidata":"https://www.wikidata.org/wiki/Q4349497","display_name":"Speech corpus","level":3,"score":0.3553149104118347},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096923","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp49357.2023.10096923","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322942","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48"},{"id":"https://openalex.org/F4320331102","display_name":"Shenzhen Research Institute of Big Data","ror":"https://ror.org/00z1gwf89"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W2896457183","https://openalex.org/W2923014074","https://openalex.org/W2933138175","https://openalex.org/W2943552823","https://openalex.org/W2965373594","https://openalex.org/W2970597249","https://openalex.org/W2979476256","https://openalex.org/W2995181338","https://openalex.org/W3011411500","https://openalex.org/W3015356564","https://openalex.org/W3035579820","https://openalex.org/W3036601975","https://openalex.org/W3148001440","https://openalex.org/W3162534564","https://openalex.org/W3180374548","https://openalex.org/W3197580070","https://openalex.org/W3207222250","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4221145109","https://openalex.org/W4221155340","https://openalex.org/W4223646224","https://openalex.org/W4226120743","https://openalex.org/W4283324001","https://openalex.org/W4285819380","https://openalex.org/W4286359908","https://openalex.org/W4300980246","https://openalex.org/W4381786045","https://openalex.org/W4385245566","https://openalex.org/W4394671563","https://openalex.org/W6631190155","https://openalex.org/W6739901393","https://openalex.org/W6750615492","https://openalex.org/W6755207826","https://openalex.org/W6762392948","https://openalex.org/W6763701032","https://openalex.org/W6766673545","https://openalex.org/W6769196770","https://openalex.org/W6771812881","https://openalex.org/W6779271971","https://openalex.org/W6780218876","https://openalex.org/W6790356757","https://openalex.org/W6803092890","https://openalex.org/W6810007534","https://openalex.org/W6810062440","https://openalex.org/W6810259195","https://openalex.org/W6840472489","https://openalex.org/W6843330092","https://openalex.org/W6845338303"],"related_works":["https://openalex.org/W4200068392","https://openalex.org/W2772686614","https://openalex.org/W2152945827","https://openalex.org/W2015513221","https://openalex.org/W2032286903","https://openalex.org/W2181773877","https://openalex.org/W301864623","https://openalex.org/W2036933852","https://openalex.org/W2184371793","https://openalex.org/W2014684632"],"abstract_inverted_index":{"Self-supervised":[0],"pre-training":[1,31,46,56,150],"has":[2],"been":[3],"successful":[4],"in":[5],"both":[6],"text":[7,13,62,125],"and":[8,12,35,50,61,77,103,116,124,132,172],"speech":[9,34,60,76,115,123],"processing.":[10],"Speech":[11],"offer":[14],"different":[15],"but":[16],"complementary":[17],"information.":[18],"The":[19],"question":[20],"is":[21,144,161],"whether":[22],"we":[23,40,70,83,119],"are":[24,108],"able":[25],"to":[26,147,154],"perform":[27],"a":[28,53,128,165],"speech-text":[29],"joint":[30,55],"on":[32,64,80,164],"unpaired":[33,59],"text.":[36,78,117],"In":[37],"this":[38],"paper,":[39],"take":[41],"the":[42,99,111,121],"idea":[43],"of":[44,67,93],"self-supervised":[45],"one":[47],"step":[48],"further":[49],"propose":[51],"token2vec,":[52],"novel":[54],"framework":[57],"for":[58,75],"based":[63],"discrete":[65,88,122],"representations":[66],"speech.":[68],"Specifically,":[69],"introduce":[71],"two":[72],"modality-specific":[73],"tokenizers":[74],"Based":[79],"these":[81],"tokenizers,":[82],"convert":[84],"speech/text":[85,89],"sequences":[86,91],"into":[87,127],"token":[90],"consisting":[92],"similar":[94],"language":[95,137],"units,":[96],"thus":[97],"mitigating":[98],"domain":[100],"mismatch":[101,105],"problem":[102],"length":[104],"problem,":[106],"which":[107],"caused":[109],"by":[110],"distinct":[112],"characteristics":[113],"between":[114],"Finally,":[118],"feed":[120],"tokens":[126],"modality-agnostic":[129],"Transformer":[130],"encoder":[131],"pre-train":[133],"with":[134,152],"token-level":[135],"masking":[136],"modeling":[138],"(tMLM).":[139],"Experiments":[140],"show":[141],"that":[142],"token2vec":[143],"significantly":[145],"superior":[146],"various":[148],"speech-only":[149],"baselines,":[151],"up":[153],"17.7%":[155],"relative":[156],"WER":[157],"reduction.":[158],"Token2vec":[159],"model":[160],"also":[162],"validated":[163],"non-ASR":[166],"task,":[167],"i.e.,":[168],"spoken":[169],"intent":[170],"classification,":[171],"shows":[173],"good":[174],"transferability.":[175]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-04T06:10:10.580331","created_date":"2025-10-10T00:00:00"}
