{"id":"https://openalex.org/W4416250900","doi":"https://doi.org/10.1109/waspaa66052.2025.11231014","title":"Multi-Class-Token Transformer for Multitask Self-supervised Music Information Retrieval","display_name":"Multi-Class-Token Transformer for Multitask Self-supervised Music Information Retrieval","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416250900","doi":"https://doi.org/10.1109/waspaa66052.2025.11231014"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11231014","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11231014","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5099057809","display_name":"Yuexuan Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuexuan Kong","raw_affiliation_strings":["Deezer Research,Paris,France"],"affiliations":[{"raw_affiliation_string":"Deezer Research,Paris,France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027789239","display_name":"Vincent Lostanlen","orcid":"https://orcid.org/0000-0003-0580-1651"},"institutions":[{"id":"https://openalex.org/I100445878","display_name":"\u00c9cole Centrale de Nantes","ror":"https://ror.org/03nh7d505","country_code":"FR","type":"education","lineage":["https://openalex.org/I100445878","https://openalex.org/I97188460"]},{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"government","lineage":["https://openalex.org/I1294671590"]},{"id":"https://openalex.org/I4210117005","display_name":"Laboratoire des Sciences du Num\u00e9rique de Nantes","ror":"https://ror.org/02snf8m58","country_code":"FR","type":"facility","lineage":["https://openalex.org/I100445878","https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I205703379","https://openalex.org/I4210117005","https://openalex.org/I4210124215","https://openalex.org/I4210127572","https://openalex.org/I4210139971","https://openalex.org/I97188460","https://openalex.org/I97188460"]},{"id":"https://openalex.org/I97188460","display_name":"Nantes Universit\u00e9","ror":"https://ror.org/03gnr7b55","country_code":"FR","type":"education","lineage":["https://openalex.org/I97188460"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Vincent Lostanlen","raw_affiliation_strings":["Nantes Universit&#x00E9;,Centrale Nantes, CNRS, LS2N, UMR 6004,Nantes,France,F-44000"],"affiliations":[{"raw_affiliation_string":"Nantes Universit&#x00E9;,Centrale Nantes, CNRS, LS2N, UMR 6004,Nantes,France,F-44000","institution_ids":["https://openalex.org/I97188460","https://openalex.org/I100445878","https://openalex.org/I4210117005","https://openalex.org/I1294671590"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004879177","display_name":"Romain Hennequin","orcid":"https://orcid.org/0000-0001-8158-5562"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Romain Hennequin","raw_affiliation_strings":["Deezer Research,Paris,France"],"affiliations":[{"raw_affiliation_string":"Deezer Research,Paris,France","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029522748","display_name":"Mathieu Lagrange","orcid":"https://orcid.org/0000-0002-1253-4427"},"institutions":[{"id":"https://openalex.org/I100445878","display_name":"\u00c9cole Centrale de Nantes","ror":"https://ror.org/03nh7d505","country_code":"FR","type":"education","lineage":["https://openalex.org/I100445878","https://openalex.org/I97188460"]},{"id":"https://openalex.org/I1294671590","display_name":"Centre National de la Recherche Scientifique","ror":"https://ror.org/02feahw73","country_code":"FR","type":"government","lineage":["https://openalex.org/I1294671590"]},{"id":"https://openalex.org/I4210117005","display_name":"Laboratoire des Sciences du Num\u00e9rique de Nantes","ror":"https://ror.org/02snf8m58","country_code":"FR","type":"facility","lineage":["https://openalex.org/I100445878","https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I1326498283","https://openalex.org/I205703379","https://openalex.org/I4210117005","https://openalex.org/I4210124215","https://openalex.org/I4210127572","https://openalex.org/I4210139971","https://openalex.org/I97188460","https://openalex.org/I97188460"]},{"id":"https://openalex.org/I97188460","display_name":"Nantes Universit\u00e9","ror":"https://ror.org/03gnr7b55","country_code":"FR","type":"education","lineage":["https://openalex.org/I97188460"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Mathieu Lagrange","raw_affiliation_strings":["Nantes Universit&#x00E9;,Centrale Nantes, CNRS, LS2N, UMR 6004,Nantes,France,F-44000"],"affiliations":[{"raw_affiliation_string":"Nantes Universit&#x00E9;,Centrale Nantes, CNRS, LS2N, UMR 6004,Nantes,France,F-44000","institution_ids":["https://openalex.org/I97188460","https://openalex.org/I100445878","https://openalex.org/I4210117005","https://openalex.org/I1294671590"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062693009","display_name":"Gabriel Meseguer-Brocal","orcid":"https://orcid.org/0000-0002-5232-8628"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gabriel Meseguer-Brocal","raw_affiliation_strings":["Deezer Research,Paris,France"],"affiliations":[{"raw_affiliation_string":"Deezer Research,Paris,France","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5099057809"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45299953,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9186000227928162,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9186000227928162,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.010099999606609344,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.009800000116229057,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/multi-task-learning","display_name":"Multi-task learning","score":0.5758000016212463},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.42100000381469727},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.41519999504089355},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4146000146865845},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.41119998693466187},{"id":"https://openalex.org/keywords/pretext","display_name":"Pretext","score":0.3982999920845032},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.38519999384880066},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.3776000142097473},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.3499000072479248}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7936999797821045},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5820000171661377},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.5758000016212463},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4634999930858612},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.42100000381469727},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.41519999504089355},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4146000146865845},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.41119998693466187},{"id":"https://openalex.org/C2779627259","wikidata":"https://www.wikidata.org/wiki/Q779763","display_name":"Pretext","level":3,"score":0.3982999920845032},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.38519999384880066},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3828999996185303},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3776000142097473},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3499000072479248},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3416000008583069},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.33309999108314514},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.32820001244544983},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3140999972820282},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.298799991607666},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C108154423","wikidata":"https://www.wikidata.org/wiki/Q1469792","display_name":"Salience (neuroscience)","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.27250000834465027},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C2779247141","wikidata":"https://www.wikidata.org/wiki/Q1049294","display_name":"Emoji","level":3,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11231014","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11231014","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1819763199","https://openalex.org/W1989445502","https://openalex.org/W2899260162","https://openalex.org/W3011176162","https://openalex.org/W3159481202","https://openalex.org/W3162528130","https://openalex.org/W4280498166","https://openalex.org/W4293518176","https://openalex.org/W4392904213","https://openalex.org/W4392931767","https://openalex.org/W4398757498","https://openalex.org/W4408353036","https://openalex.org/W4408354298","https://openalex.org/W4414199528"],"related_works":[],"abstract_inverted_index":{"Contrastive":[0],"learning":[1,4,10,144,254],"and":[2,171],"equivariant":[3,143,183],"are":[5,110],"effective":[6,31,39],"methods":[7,51],"for":[8,12,142,158,230,256],"self-supervised":[9,114,127],"(SSL)":[11],"audio":[13],"content":[14],"analysis.":[15],"Yet,":[16],"their":[17],"application":[18],"to":[19,66,112,240],"music":[20],"information":[21],"retrieval":[22],"(MIR)":[23],"faces":[24],"a":[25,74,79,96],"dilemma:":[26],"the":[27,53,120,124,146,151,165,186,197,201,210,216,248],"former":[28,134],"is":[29,57,95],"more":[30],"on":[32,40,52,83,193,215,226],"tagging":[33],"(e.g.,":[34,43],"instrument":[35],"recognition)":[36],"but":[37,60,117],"less":[38],"structured":[41],"prediction":[42],"tonality":[44],"estimation);":[45],"The":[46,91,133],"latter":[47,152],"can":[48],"match":[49],"supervised":[50],"specific":[54],"task":[55],"it":[56,61],"designed":[58],"for,":[59],"does":[62],"not":[63],"generalize":[64],"well":[65],"other":[67],"tasks.":[68],"In":[69],"this":[70,234],"article,":[71],"we":[72],"adopt":[73],"best-of-both-worlds":[75],"approach":[76,255],"by":[77,204],"training":[78],"deep":[80],"neural":[81],"network":[82],"both":[84,168,174],"kinds":[85],"of":[86,126,148,167,200,218,250],"pretext":[87,115,169],"tasks":[88,116,170,228],"at":[89],"once.":[90],"proposed":[92],"new":[93],"architecture":[94],"Vision":[97],"Transformer":[98],"with":[99,105,179,235],"1-D":[100],"spectrogram":[101],"patches":[102],"(ViT-1D),":[103],"equipped":[104],"two":[106,187],"class":[107,135,188,206],"tokens,":[108],"which":[109],"specialized":[111],"different":[113],"optimized":[118],"through":[119],"same":[121,211],"model:":[122],"hence":[123],"qualification":[125],"multi-class-token":[128,252],"multitask":[129,253],"(MT<sup":[130],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[131,162,222],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>).":[132],"token":[136],"optimizes":[137,153],"cross-power":[138],"spectral":[139],"density":[140],"(CPSD)":[141],"over":[145],"circle":[147],"fifths,":[149],"while":[150],"normalized":[154],"temperature-scaled":[155],"cross-entropy":[156],"(NT-Xent)":[157],"contrastive":[159,181],"learning.":[160,184],"MT<sup":[161,221],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2</sup>":[163,223],"combines":[164],"strengths":[166],"outperforms":[172,224],"consistently":[173],"single-class-token":[175],"ViT-1D":[176],"models":[177],"trained":[178],"either":[180],"or":[182],"Averaging":[185],"tokens":[189],"further":[190],"improves":[191],"performance":[192],"several":[194],"tasks,":[195],"highlighting":[196],"complementary":[198],"nature":[199],"representations":[202],"learned":[203],"each":[205],"token.":[207],"Furthermore,":[208],"using":[209],"single-linear-layer":[212],"probing":[213],"method":[214],"features":[217],"last":[219],"layer,":[220],"MERT":[225],"all":[227],"except":[229],"beat":[231],"tracking;":[232],"achieving":[233],"18x":[236],"fewer":[237],"parameters":[238],"thanks":[239],"its":[241],"multitasking":[242],"capabilities.":[243],"Our":[244],"SSL":[245],"benchmark":[246],"demonstrates":[247],"versatility":[249],"our":[251],"MIR":[257],"applications.":[258]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
