{"id":"https://openalex.org/W4224930323","doi":"https://doi.org/10.1109/icassp43922.2022.9747374","title":"An Adapter Based Pre-Training for Efficient and Scalable Self-Supervised Speech Representation Learning","display_name":"An Adapter Based Pre-Training for Efficient and Scalable Self-Supervised Speech Representation Learning","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4224930323","doi":"https://doi.org/10.1109/icassp43922.2022.9747374"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747374","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747374","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056415825","display_name":"Samuel Kessler","orcid":"https://orcid.org/0009-0007-4940-8575"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Samuel Kessler","raw_affiliation_strings":["University of Oxford"],"affiliations":[{"raw_affiliation_string":"University of Oxford","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051254142","display_name":"Bethan Thomas","orcid":null},"institutions":[{"id":"https://openalex.org/I4210160618","display_name":"Huawei Technologies (United Kingdom)","ror":"https://ror.org/056gzgs71","country_code":"GB","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210160618"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Bethan Thomas","raw_affiliation_strings":["Huawei R&#x0026;D UK"],"affiliations":[{"raw_affiliation_string":"Huawei R&#x0026;D UK","institution_ids":["https://openalex.org/I4210160618"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034921884","display_name":"Salah Karout","orcid":null},"institutions":[{"id":"https://openalex.org/I4210160618","display_name":"Huawei Technologies (United Kingdom)","ror":"https://ror.org/056gzgs71","country_code":"GB","type":"company","lineage":["https://openalex.org/I2250955327","https://openalex.org/I4210160618"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Salah Karout","raw_affiliation_strings":["Huawei R&#x0026;D UK"],"affiliations":[{"raw_affiliation_string":"Huawei R&#x0026;D UK","institution_ids":["https://openalex.org/I4210160618"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5056415825"],"corresponding_institution_ids":["https://openalex.org/I40120149"],"apc_list":null,"apc_paid":null,"fwci":1.5582,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.84669252,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3179","last_page":"3183"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.867723286151886},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.60335773229599},{"id":"https://openalex.org/keywords/adapter","display_name":"Adapter (computing)","score":0.5753995180130005},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5680374503135681},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5537515878677368},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5104693174362183},{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.4965053200721741},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4950689375400543},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.494337797164917},{"id":"https://openalex.org/keywords/multi-task-learning","display_name":"Multi-task learning","score":0.4455896019935608},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4244970381259918},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.42066746950149536},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.16127288341522217}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.867723286151886},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.60335773229599},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.5753995180130005},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5680374503135681},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5537515878677368},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5104693174362183},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.4965053200721741},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4950689375400543},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.494337797164917},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.4455896019935608},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4244970381259918},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.42066746950149536},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.16127288341522217},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747374","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747374","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7300000190734863}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2060277733","https://openalex.org/W2473930607","https://openalex.org/W2547875792","https://openalex.org/W2548228487","https://openalex.org/W2560647685","https://openalex.org/W2842511635","https://openalex.org/W2896457183","https://openalex.org/W2933138175","https://openalex.org/W2963211188","https://openalex.org/W2964303773","https://openalex.org/W2972943112","https://openalex.org/W2973049979","https://openalex.org/W2973157397","https://openalex.org/W2995181338","https://openalex.org/W3030437843","https://openalex.org/W3036601975","https://openalex.org/W3101498587","https://openalex.org/W3134307371","https://openalex.org/W3153675281","https://openalex.org/W3209059054","https://openalex.org/W4297808394","https://openalex.org/W4301163820","https://openalex.org/W4385245566","https://openalex.org/W6729448088","https://openalex.org/W6730091202","https://openalex.org/W6738045163","https://openalex.org/W6739901393","https://openalex.org/W6741217325","https://openalex.org/W6755207826","https://openalex.org/W6759579507","https://openalex.org/W6761176036","https://openalex.org/W6771467084","https://openalex.org/W6777155081","https://openalex.org/W6780218876","https://openalex.org/W6780226713","https://openalex.org/W6791141078"],"related_works":["https://openalex.org/W4281760909","https://openalex.org/W2237537322","https://openalex.org/W2950678851","https://openalex.org/W4301248618","https://openalex.org/W4321593827","https://openalex.org/W2165343651","https://openalex.org/W2242427765","https://openalex.org/W2075830955","https://openalex.org/W2343790552","https://openalex.org/W4309969809"],"abstract_inverted_index":{"We":[0,84,117,154],"present":[1],"a":[2,35,52,59,125,139],"method":[3],"for":[4,74],"transferring":[5],"pre-trained":[6],"self-supervised":[7,23],"(SSL)":[8],"speech":[9,40,163],"representations":[10,24,99,160],"to":[11,38,82,121,161],"multiple":[12],"languages.":[13],"There":[14],"is":[15,34],"an":[16,86],"abundance":[17],"of":[18,62,70,95,112],"unannotated":[19],"speech,":[20],"so":[21,108],"creating":[22],"from":[25],"raw":[26,49],"audio":[27,50],"and":[28,55,91,143],"fine-tuning":[29],"on":[30,48,58],"small":[31,60],"annotated":[32,63],"datasets":[33],"promising":[36],"direction":[37],"build":[39],"recognition":[41],"systems.":[42],"SSL":[43,47],"models":[44,66,78],"generally":[45],"perform":[46],"in":[51],"pre-training":[53,124,133],"phase":[54],"then":[56],"fine-tune":[57],"fraction":[61],"data.":[64],"Such":[65],"have":[67],"produced":[68],"state":[69],"the":[71,93,113],"art":[72],"results":[73],"ASR.":[75],"However,":[76],"these":[77,158],"are":[79],"very":[80],"expensive":[81],"pre-train.":[83],"use":[85,118],"existing":[87,102,114],"wav2vec":[88],"2.0":[89],"model":[90,103,130],"tackle":[92],"problem":[94],"learning":[96,138],"new":[97,126,140,146],"language":[98,115,127,141,152,159],"while":[100],"utilizing":[101],"knowledge.":[104],"Crucially":[105],"we":[106],"do":[107],"without":[109,149],"catastrophic":[110],"forgetting":[111,150],"representation.":[116,153],"adapter":[119],"modules":[120],"speed":[122],"up":[123],"task.":[128],"Our":[129],"can":[131],"decrease":[132],"times":[134],"by":[135,156],"32%":[136],"when":[137],"task,":[142],"learn":[144],"this":[145],"audio-language":[147],"representation":[148],"previous":[151],"evaluate":[155],"applying":[157],"automatic":[162],"recognition.":[164]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":8},{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
