{"id":"https://openalex.org/W3206252155","doi":"https://doi.org/10.1109/icassp43922.2022.9747077","title":"Unispeech-Sat: Universal Speech Representation Learning With Speaker Aware Pre-Training","display_name":"Unispeech-Sat: Universal Speech Representation Learning With Speaker Aware Pre-Training","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W3206252155","doi":"https://doi.org/10.1109/icassp43922.2022.9747077","mag":"3206252155"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747077","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079533447","display_name":"Sanyuan Chen","orcid":"https://orcid.org/0000-0002-3082-6052"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]},{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["CN","FI"],"is_corresponding":true,"raw_author_name":"Sanyuan Chen","raw_affiliation_strings":["Harbin Institute of Technology,China","Harbin Institute of Technology, China","Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology,China","institution_ids":["https://openalex.org/I204983213"]},{"raw_affiliation_string":"Harbin Institute of Technology, China","institution_ids":["https://openalex.org/I204983213"]},{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101709477","display_name":"Yu Wu","orcid":"https://orcid.org/0000-0002-5715-3011"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Yu Wu","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101863385","display_name":"Chengyi Wang","orcid":"https://orcid.org/0000-0002-6780-9299"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Chengyi Wang","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101416769","display_name":"Zhengyang Chen","orcid":"https://orcid.org/0000-0003-1293-8146"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Zhengyang Chen","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100345092","display_name":"Zhuo Chen","orcid":"https://orcid.org/0000-0002-9011-7928"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Zhuo Chen","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101635405","display_name":"Shujie Liu","orcid":"https://orcid.org/0009-0008-0785-8882"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Shujie Liu","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101674460","display_name":"Jian Wu","orcid":"https://orcid.org/0000-0002-3101-7011"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Jian Wu","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100342006","display_name":"Yao Qian","orcid":"https://orcid.org/0000-0003-1855-9630"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Yao Qian","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014662947","display_name":"Furu Wei","orcid":"https://orcid.org/0000-0002-7810-5852"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Furu Wei","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100365053","display_name":"Jinyu Li","orcid":"https://orcid.org/0000-0002-1089-9748"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Jinyu Li","raw_affiliation_strings":["Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation","institution_ids":["https://openalex.org/I4210105678"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072540013","display_name":"Xiangzhan Yu","orcid":"https://orcid.org/0000-0002-1183-2844"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiangzhan Yu","raw_affiliation_strings":["Harbin Institute of Technology,China","Harbin Institute of Technology, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology,China","institution_ids":["https://openalex.org/I204983213"]},{"raw_affiliation_string":"Harbin Institute of Technology, China","institution_ids":["https://openalex.org/I204983213"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5079533447"],"corresponding_institution_ids":["https://openalex.org/I204983213","https://openalex.org/I4210105678"],"apc_list":null,"apc_paid":null,"fwci":7.5107,"has_fulltext":false,"cited_by_count":81,"citation_normalized_percentile":{"value":0.97959724,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"6152","last_page":"6156"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8420337438583374},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.6594366431236267},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6402137279510498},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6157636046409607},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5889496207237244},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5672709345817566},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.528735876083374},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.4985160827636719},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4934627115726471},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4858929514884949},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3687823414802551}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8420337438583374},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.6594366431236267},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6402137279510498},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6157636046409607},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5889496207237244},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5672709345817566},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.528735876083374},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.4985160827636719},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4934627115726471},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4858929514884949},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3687823414802551},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747077","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.47999998927116394},{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.47999998927116394}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2547875792","https://openalex.org/W2842511635","https://openalex.org/W2972943112","https://openalex.org/W2973049979","https://openalex.org/W2979476256","https://openalex.org/W2982223350","https://openalex.org/W2995181338","https://openalex.org/W2996383576","https://openalex.org/W3015213852","https://openalex.org/W3016011332","https://openalex.org/W3016181583","https://openalex.org/W3036601975","https://openalex.org/W3041561163","https://openalex.org/W3095292526","https://openalex.org/W3097286738","https://openalex.org/W3099782249","https://openalex.org/W3119308075","https://openalex.org/W3169320628","https://openalex.org/W3169688220","https://openalex.org/W3197580070","https://openalex.org/W3198694222","https://openalex.org/W3198858531","https://openalex.org/W3204696009","https://openalex.org/W3209059054","https://openalex.org/W4297808394","https://openalex.org/W6629717138","https://openalex.org/W6729448088","https://openalex.org/W6769196770","https://openalex.org/W6780218876","https://openalex.org/W6780361010","https://openalex.org/W6784776607","https://openalex.org/W6802600657"],"related_works":["https://openalex.org/W2206035908","https://openalex.org/W2162158162","https://openalex.org/W4247736853","https://openalex.org/W1493012537","https://openalex.org/W2175373321","https://openalex.org/W2125642021","https://openalex.org/W4310979479","https://openalex.org/W2696990509","https://openalex.org/W1999004162","https://openalex.org/W1521049138"],"abstract_inverted_index":{"Self-supervised":[0],"learning":[1,30,76],"(SSL)":[2],"is":[3,154],"a":[4],"long-standing":[5],"goal":[6],"for":[7,42,57,65,94,104,146],"speech":[8,32],"processing,":[9],"since":[10],"it":[11],"utilizes":[12],"large-scale":[13],"unlabeled":[14],"data":[15,105,177],"and":[16,114,178],"avoids":[17],"extensive":[18],"human":[19],"labeling.":[20],"Recent":[21],"years":[22],"have":[23],"witnessed":[24],"great":[25],"successes":[26],"in":[27,31,39,141,183],"applying":[28,40],"self-supervised":[29],"recognition,":[33],"while":[34],"limited":[35],"exploration":[36],"was":[37],"attempted":[38],"SSL":[41,55,80,90],"modeling":[43],"speaker":[44,58,69,96,147],"characteristics.":[45],"In":[46],"this":[47],"paper,":[48],"we":[49,73,83,98,164],"aim":[50],"to":[51,77,170],"improve":[52],"the":[53,67,78,89,120,124,130,135,157,167],"existing":[54],"framework":[56],"representation":[59,143],"learning.":[60],"Two":[61],"methods":[62,122],"are":[63,111],"introduced":[64],"enhancing":[66],"unsupervised":[68],"information":[70],"extraction.":[71],"First,":[72],"apply":[74],"multi-task":[75],"current":[79],"framework,":[81],"where":[82,107],"integrate":[84,119],"utterance-wise":[85],"contrastive":[86],"loss":[87],"with":[88],"objective":[91],"function.":[92],"Second,":[93],"better":[95],"discrimination,":[97],"propose":[99],"an":[100],"utterance":[101],"mixing":[102],"strategy":[103],"augmentation,":[106],"additional":[108],"overlapped":[109],"utterances":[110],"created":[112],"unsupervisely":[113],"incorporated":[115],"during":[116],"training.":[117],"We":[118],"proposed":[121,136,161],"into":[123],"HuBERT":[125],"framework.":[126],"Experiment":[127],"results":[128],"on":[129],"SUPERB":[131,185],"benchmark":[132],"show":[133],"that":[134],"system":[137],"achieves":[138],"state-of-the-art":[139],"performance":[140,181],"universal":[142],"learning,":[144],"especially":[145],"identification":[148],"oriented":[149],"tasks.":[150,186],"An":[151],"ablation":[152],"study":[153],"performed":[155],"verifying":[156],"efficacy":[158],"of":[159,174],"each":[160],"method.":[162],"Finally,":[163],"scale":[165],"up":[166],"training":[168],"dataset":[169],"94":[171],"thousand":[172],"hours":[173],"public":[175],"audio":[176],"achieve":[179],"further":[180],"improvement":[182],"all":[184]},"counts_by_year":[{"year":2026,"cited_by_count":8},{"year":2025,"cited_by_count":25},{"year":2024,"cited_by_count":18},{"year":2023,"cited_by_count":18},{"year":2022,"cited_by_count":11},{"year":2021,"cited_by_count":1}],"updated_date":"2026-04-23T09:07:50.710637","created_date":"2025-10-10T00:00:00"}
