{"id":"https://openalex.org/W4385823427","doi":"https://doi.org/10.21437/interspeech.2023-343","title":"Investigating Pre-trained Audio Encoders in the Low-Resource Condition","display_name":"Investigating Pre-trained Audio Encoders in the Low-Resource Condition","publication_year":2023,"publication_date":"2023-08-14","ids":{"openalex":"https://openalex.org/W4385823427","doi":"https://doi.org/10.21437/interspeech.2023-343"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2023-343","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-343","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://researchmgt.monash.edu/ws/files/523733686/507015944_oa.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101895792","display_name":"Hao Yang","orcid":"https://orcid.org/0009-0005-9035-8181"},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Hao Yang","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100531552","display_name":"Jinming Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Jinming Zhao","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081525024","display_name":"Gholamreza Haffari","orcid":"https://orcid.org/0000-0001-7326-8380"},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Gholamreza Haffari","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086032589","display_name":"Ehsan Shareghi","orcid":null},"institutions":[{"id":"https://openalex.org/I56590836","display_name":"Monash University","ror":"https://ror.org/02bfwt286","country_code":"AU","type":"education","lineage":["https://openalex.org/I56590836"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Ehsan Shareghi","raw_affiliation_strings":["Department of Data Science & AI, Monash University"],"affiliations":[{"raw_affiliation_string":"Department of Data Science & AI, Monash University","institution_ids":["https://openalex.org/I56590836"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101895792"],"corresponding_institution_ids":["https://openalex.org/I56590836"],"apc_list":null,"apc_paid":null,"fwci":0.5392,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.71850878,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1498","last_page":"1502"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.8840572834014893},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8475344777107239},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.6268641948699951},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5698975324630737},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5616313815116882},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5081303119659424},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4616948366165161},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.34437671303749084},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.06338393688201904},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.059416383504867554},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.05913674831390381}],"concepts":[{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.8840572834014893},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8475344777107239},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.6268641948699951},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5698975324630737},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5616313815116882},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5081303119659424},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4616948366165161},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34437671303749084},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.06338393688201904},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.059416383504867554},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.05913674831390381},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C50522688","wikidata":"https://www.wikidata.org/wiki/Q189833","display_name":"Economic growth","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.21437/interspeech.2023-343","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-343","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},{"id":"pmh:oai:monash.edu:publications/9043544d-2a06-45e1-9770-b79396a1b2b0","is_oa":true,"landing_page_url":"https://research.monash.edu/en/publications/9043544d-2a06-45e1-9770-b79396a1b2b0","pdf_url":"https://researchmgt.monash.edu/ws/files/523733686/507015944_oa.pdf","source":{"id":"https://openalex.org/S4306402625","display_name":"Monash University Research Portal (Monash University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I56590836","host_organization_name":"Monash University","host_organization_lineage":["https://openalex.org/I56590836"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yang, H, Zhao, J, Haffari, R & Shareghi, E 2023, Investigating pre-trained audio encoders in the low-resource condition. in N Harte, J Berndsen & G Jones (eds), Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2023. International Speech Communication Association (ISCA), Dublin Ireland, pp. 1498-1502, Annual Conference of the International Speech Communication Association 2023, Dublin, Ireland, 20/08/23. https://doi.org/10.21437/Interspeech.2023-343","raw_type":"contributionToPeriodical"},{"id":"pmh:oai:monash.edu:openaire/9043544d-2a06-45e1-9770-b79396a1b2b0","is_oa":true,"landing_page_url":"https://research.monash.edu/files/523733686/507015944_oa.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306402625","display_name":"Monash University Research Portal (Monash University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I56590836","host_organization_name":"Monash University","host_organization_lineage":["https://openalex.org/I56590836"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yang, H, Zhao, J, Haffari, R & Shareghi, E 2023, Investigating pre-trained audio encoders in the low-resource condition. in N Harte, J Berndsen & G Jones (eds), Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2023. International Speech Communication Association (ISCA), Dublin Ireland, pp. 1498-1502, Annual Conference of the International Speech Communication Association 2023, Dublin, Ireland, 20/08/23. https://doi.org/10.21437/Interspeech.2023-343","raw_type":"contributionToPeriodical"}],"best_oa_location":{"id":"pmh:oai:monash.edu:publications/9043544d-2a06-45e1-9770-b79396a1b2b0","is_oa":true,"landing_page_url":"https://research.monash.edu/en/publications/9043544d-2a06-45e1-9770-b79396a1b2b0","pdf_url":"https://researchmgt.monash.edu/ws/files/523733686/507015944_oa.pdf","source":{"id":"https://openalex.org/S4306402625","display_name":"Monash University Research Portal (Monash University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I56590836","host_organization_name":"Monash University","host_organization_lineage":["https://openalex.org/I56590836"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yang, H, Zhao, J, Haffari, R & Shareghi, E 2023, Investigating pre-trained audio encoders in the low-resource condition. in N Harte, J Berndsen & G Jones (eds), Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2023. International Speech Communication Association (ISCA), Dublin Ireland, pp. 1498-1502, Annual Conference of the International Speech Communication Association 2023, Dublin, Ireland, 20/08/23. https://doi.org/10.21437/Interspeech.2023-343","raw_type":"contributionToPeriodical"},"sustainable_development_goals":[{"score":0.44999998807907104,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4385823427.pdf"},"referenced_works_count":23,"referenced_works":["https://openalex.org/W2518186251","https://openalex.org/W2896457183","https://openalex.org/W2973049979","https://openalex.org/W3036601975","https://openalex.org/W3105816068","https://openalex.org/W3121914243","https://openalex.org/W3162133897","https://openalex.org/W3197580070","https://openalex.org/W3198771897","https://openalex.org/W3199531394","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W3213730158","https://openalex.org/W4226380987","https://openalex.org/W4281492411","https://openalex.org/W4283834483","https://openalex.org/W4287989344","https://openalex.org/W4297808394","https://openalex.org/W4311000453","https://openalex.org/W4311137818","https://openalex.org/W4375869259","https://openalex.org/W4385245566","https://openalex.org/W4385574164"],"related_works":["https://openalex.org/W4390516098","https://openalex.org/W2181948922","https://openalex.org/W2384362569","https://openalex.org/W2142795561","https://openalex.org/W4205302943","https://openalex.org/W2561132942","https://openalex.org/W3155418658","https://openalex.org/W4243199227","https://openalex.org/W2379948177","https://openalex.org/W2334580170"],"abstract_inverted_index":{"Pre-trained":[0],"speech":[1,12,59],"encoders":[2,22,49,92],"have":[3],"been":[4],"central":[5],"to":[6,28],"pushing":[7],"state-of-the-art":[8,48],"results":[9],"across":[10,57],"various":[11,66],"understanding":[13,60],"and":[14,61,68,76,93,124],"generation":[15,62],"tasks.":[16,63],"Nonetheless,":[17],"the":[18,54,80,87,94,109,113],"capabilities":[19,116],"of":[20,40,46,79,90,122],"these":[21,91],"in":[23,53,96,101,120],"low-resource":[24,55,115],"settings":[25],"are":[26],"yet":[27],"be":[29],"thoroughly":[30],"explored.":[31],"To":[32],"address":[33],"this,":[34],"we":[35,107],"conduct":[36],"a":[37,43,84],"comprehensive":[38],"set":[39,45],"experiments":[41],"using":[42],"representative":[44],"3":[47],"(Wav2vec2,":[50],"WavLM,":[51],"Whisper)":[52],"setting":[56],"7":[58],"We":[64,82],"provide":[65],"quantitative":[67],"qualitative":[69],"analyses":[70],"on":[71,117],"task":[72],"performance,":[73],"convergence":[74,125],"speed,":[75],"representational":[77],"properties":[78],"encoders.":[81],"observe":[83,108],"connection":[85],"between":[86],"pre-training":[88],"protocols":[89],"way":[95],"which":[97],"they":[98],"capture":[99],"information":[100],"their":[102],"internal":[103],"layers.":[104],"In":[105],"particular,":[106],"Whisper":[110],"encoder":[111],"exhibits":[112],"greatest":[114],"content-driven":[118],"tasks":[119],"terms":[121],"performance":[123],"speed.":[126]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
