{"id":"https://openalex.org/W4392902846","doi":"https://doi.org/10.1109/icassp48485.2024.10446342","title":"uaMix-MAE: Efficient Tuning of Pretrained Audio Transformers with Unsupervised Audio Mixtures","display_name":"uaMix-MAE: Efficient Tuning of Pretrained Audio Transformers with Unsupervised Audio Mixtures","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392902846","doi":"https://doi.org/10.1109/icassp48485.2024.10446342"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10446342","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446342","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023205287","display_name":"Afrina Tabassum","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Afrina Tabassum","raw_affiliation_strings":["Virginia Tech"],"affiliations":[{"raw_affiliation_string":"Virginia Tech","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101507065","display_name":"Dung Tran","orcid":null},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]},{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["FI","US"],"is_corresponding":false,"raw_author_name":"Dung Tran","raw_affiliation_strings":["Microsoft Corporation,Applied Sciences Group","Applied Sciences Group, Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,Applied Sciences Group","institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]},{"raw_affiliation_string":"Applied Sciences Group, Microsoft Corporation","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040335961","display_name":"Trung Dang","orcid":"https://orcid.org/0009-0002-9914-5009"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]},{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]}],"countries":["FI","US"],"is_corresponding":false,"raw_author_name":"Trung Dang","raw_affiliation_strings":["Microsoft Corporation,Applied Sciences Group","Applied Sciences Group, Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,Applied Sciences Group","institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]},{"raw_affiliation_string":"Applied Sciences Group, Microsoft Corporation","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043962698","display_name":"Ismini Lourentzou","orcid":"https://orcid.org/0000-0002-1238-772X"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]},{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ismini Lourentzou","raw_affiliation_strings":["Virginia Tech","University of Illinois at Urbana - Champaign"],"affiliations":[{"raw_affiliation_string":"Virginia Tech","institution_ids":["https://openalex.org/I859038795"]},{"raw_affiliation_string":"University of Illinois at Urbana - Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084879161","display_name":"Kazuhito Koishida","orcid":"https://orcid.org/0000-0002-3111-5375"},"institutions":[{"id":"https://openalex.org/I4210105678","display_name":"Microsoft (Finland)","ror":"https://ror.org/01nehjf29","country_code":"FI","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]},{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["FI","US"],"is_corresponding":false,"raw_author_name":"Kazuhito Koishida","raw_affiliation_strings":["Microsoft Corporation,Applied Sciences Group","Applied Sciences Group, Microsoft Corporation"],"affiliations":[{"raw_affiliation_string":"Microsoft Corporation,Applied Sciences Group","institution_ids":["https://openalex.org/I1290206253","https://openalex.org/I4210105678"]},{"raw_affiliation_string":"Applied Sciences Group, Microsoft Corporation","institution_ids":["https://openalex.org/I1290206253"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5023205287"],"corresponding_institution_ids":["https://openalex.org/I859038795"],"apc_list":null,"apc_paid":null,"fwci":0.3768,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.49456551,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"5435","last_page":"5439"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8220496773719788},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.62102210521698},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.49313873052597046},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.47836726903915405},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.4728391766548157},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.45044034719467163},{"id":"https://openalex.org/keywords/unsupervised-learning","display_name":"Unsupervised learning","score":0.4147067070007324},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4061163067817688},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3678586781024933}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8220496773719788},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.62102210521698},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49313873052597046},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.47836726903915405},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.4728391766548157},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.45044034719467163},{"id":"https://openalex.org/C8038995","wikidata":"https://www.wikidata.org/wiki/Q1152135","display_name":"Unsupervised learning","level":2,"score":0.4147067070007324},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4061163067817688},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3678586781024933},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10446342","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10446342","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.47999998927116394,"display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W2052666245","https://openalex.org/W2117539524","https://openalex.org/W2187089797","https://openalex.org/W2593116425","https://openalex.org/W2606176153","https://openalex.org/W2726515241","https://openalex.org/W2765407302","https://openalex.org/W2797583228","https://openalex.org/W2884535146","https://openalex.org/W2896457183","https://openalex.org/W2913289886","https://openalex.org/W2992308087","https://openalex.org/W3005680577","https://openalex.org/W3035524453","https://openalex.org/W3094502228","https://openalex.org/W3129576130","https://openalex.org/W3136810184","https://openalex.org/W3158714121","https://openalex.org/W3162391496","https://openalex.org/W3170863103","https://openalex.org/W3176276772","https://openalex.org/W3196974791","https://openalex.org/W3201143670","https://openalex.org/W4224980507","https://openalex.org/W4283810973","https://openalex.org/W4284898017","https://openalex.org/W4285483774","https://openalex.org/W4292779060","https://openalex.org/W4297841853","https://openalex.org/W4312048190","https://openalex.org/W4313069509","https://openalex.org/W4313156423","https://openalex.org/W4322718191","https://openalex.org/W4372341324","https://openalex.org/W4375869340","https://openalex.org/W4385823422","https://openalex.org/W4393159256","https://openalex.org/W6736723571","https://openalex.org/W6745136726","https://openalex.org/W6753583920","https://openalex.org/W6774314701","https://openalex.org/W6778883912","https://openalex.org/W6790978476","https://openalex.org/W6791713434","https://openalex.org/W6796761347","https://openalex.org/W6810510949","https://openalex.org/W6840200333","https://openalex.org/W6848208918"],"related_works":["https://openalex.org/W2361861616","https://openalex.org/W2263699433","https://openalex.org/W2377979023","https://openalex.org/W2218034408","https://openalex.org/W2392921965","https://openalex.org/W2358755282","https://openalex.org/W4389518428","https://openalex.org/W165115930","https://openalex.org/W2096363773","https://openalex.org/W2794908468"],"abstract_inverted_index":{"Masked":[0],"Autoencoders":[1],"(MAEs)":[2],"learn":[3],"rich":[4],"low-level":[5],"representations":[6,88],"from":[7],"unlabeled":[8,107,146],"data":[9,14],"but":[10],"require":[11],"substantial":[12],"labeled":[13,49],"to":[15,18,32,57,96],"effectively":[16],"adapt":[17],"downstream":[19,45],"tasks.":[20],"Conversely,":[21],"Instance":[22],"Discrimination":[23],"(ID)":[24],"emphasizes":[25],"high-level":[26],"semantics,":[27],"offering":[28],"a":[29],"potential":[30],"solution":[31],"alleviate":[33],"annotation":[34],"requirements":[35],"in":[36,119,127],"MAEs.":[37],"Although":[38],"combining":[39],"these":[40],"two":[41],"approaches":[42],"can":[43],"address":[44,66],"tasks":[46],"with":[47,103,144],"limited":[48,145],"data,":[50,108,147],"naively":[51],"integrating":[52],"ID":[53,74],"into":[54],"MAEs":[55],"leads":[56],"extended":[58],"training":[59],"times":[60],"and":[61,122],"high":[62],"computational":[63],"costs.":[64],"To":[65,99],"this":[67],"challenge,":[68],"we":[69,109],"introduce":[70],"uaMix-MAE,":[71],"an":[72,111],"efficient":[73],"tuning":[75],"strategy":[76],"that":[77,115,131],"leverages":[78],"unsupervised":[79],"audio":[80,112,117],"mixtures.":[81],"Utilizing":[82],"contrastive":[83],"tuning,":[84],"uaMix-MAE":[85,132],"aligns":[86],"the":[87,101],"of":[89,106],"pretrained":[90],"MAEs,":[91],"thereby":[92],"facilitating":[93],"effective":[94],"adaptation":[95],"task-specific":[97],"semantics.":[98],"optimize":[100],"model":[102],"small":[104],"amounts":[105],"propose":[110],"mixing":[113],"technique":[114],"manipulates":[116],"samples":[118],"both":[120],"input":[121],"virtual":[123],"label":[124],"spaces.":[125],"Experiments":[126],"low/few-shot":[128],"settings":[129],"demonstrate":[130],"achieves":[133],"4":[134],"\u2212":[135],"6%":[136],"accuracy":[137],"improvements":[138],"over":[139],"various":[140],"benchmarks":[141],"when":[142],"tuned":[143],"such":[148],"as":[149],"AudioSet-20K.":[150]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-21T23:12:01.093139","created_date":"2025-10-10T00:00:00"}
