{"id":"https://openalex.org/W7108725614","doi":"https://doi.org/10.5281/zenodo.17811358","title":"Emergent musical properties of a transformer under contrastive self-supervised learning","display_name":"Emergent musical properties of a transformer under contrastive self-supervised learning","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108725614","doi":"https://doi.org/10.5281/zenodo.17811358"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811358","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811358","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811358","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yuexuan KONG","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuexuan KONG","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gabriel Mesegues-Brocal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gabriel Mesegues-Brocal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Vincent Lostanlen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vincent Lostanlen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Mathieu Lagrange","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mathieu Lagrange","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Romain Hennequin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Romain Hennequin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.63183865,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9639999866485596,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9639999866485596,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.007400000002235174,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10788","display_name":"Neuroscience and Music Perception","score":0.004399999976158142,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6765000224113464},{"id":"https://openalex.org/keywords/chord","display_name":"Chord (peer-to-peer)","score":0.5235000252723694},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.37220001220703125},{"id":"https://openalex.org/keywords/pretext","display_name":"Pretext","score":0.2800000011920929},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.2583000063896179}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6765000224113464},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.603600025177002},{"id":"https://openalex.org/C194147245","wikidata":"https://www.wikidata.org/wiki/Q1076368","display_name":"Chord (peer-to-peer)","level":2,"score":0.5235000252723694},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.47209998965263367},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45260000228881836},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.37220001220703125},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.35030001401901245},{"id":"https://openalex.org/C2779627259","wikidata":"https://www.wikidata.org/wiki/Q779763","display_name":"Pretext","level":3,"score":0.2800000011920929},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.24230000376701355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811358","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811358","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811358","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811358","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5800601840019226}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"music":[1],"information":[2],"retrieval":[3],"(MIR),":[4],"contrastive":[5,30,55,83,188],"self-supervised":[6],"learning":[7],"is":[8,26,33,40],"effective":[9],"for":[10,18,193],"global":[11,116],"tasks":[12,20],"such":[13,21,147],"as":[14,22,148],"automatic":[15],"tagging.":[16],"However,":[17],"local":[19,62,131],"chord":[23,151],"estimation,":[24],"it":[25,80],"widely":[27],"assumed":[28],"that":[29,36],"pretext":[31],"task":[32],"inadequate":[34],"and":[35,78,150,158,180],"more":[37],"sophisticated":[38],"SSL":[39,56,84,189],"necessary;":[41],"e.g.,":[42],"masked":[43],"modeling.":[44],"Our":[45,161],"paper":[46,162],"challenges":[47],"this":[48],"assumption":[49],"by":[50],"revealing":[51],"the":[52,74,96,127,168,171,175],"potential":[53],"of":[54,170,178,187],"paired":[57,190],"with":[58,70,81,191],"a":[59,67,122],"transformer":[60,69],"in":[61,73,111,196],"MIR":[63],"tasks.":[64],"We":[65],"consider":[66],"vision":[68],"one-dimensional":[71],"patches":[72],"time--frequency":[75],"domain":[76],"(ViT-1D)":[77],"train":[79],"simple":[82],"through":[85],"normalized":[86],"temperature-scaled":[87],"cross-entropy":[88],"loss":[89],"(NT-Xent).":[90],"Although":[91],"NT-Xent":[92],"operates":[93],"only":[94],"over":[95],"class":[97,128],"token,":[98],"we":[99],"observe":[100],"that,":[101],"potentially":[102],"thanks":[103],"to":[104,126,166],"weight":[105],"sharing,":[106],"informative":[107],"musical":[108,145,176],"properties":[109],"emerge":[110,153],"ViT-1D's":[112],"sequence":[113,194],"tokens.":[114],"On":[115,130],"tasks,":[117,132],"their":[118],"temporal":[119],"average":[120],"offers":[121],"performance":[123],"boost":[124],"compared":[125],"token.":[129],"they":[133],"perform":[134],"unexpectedly":[135],"well,":[136],"despite":[137],"not":[138,164],"being":[139],"specifically":[140],"trained":[141],"for.":[142],"Furthermore,":[143],"high-level":[144],"features":[146],"onsets":[149],"changes":[152],"from":[154],"layerwise":[155],"self-similarity":[156],"matrices":[157],"attention":[159],"maps.":[160],"does":[163],"aim":[165],"outperform":[167],"state":[169],"art":[172],"but":[173],"advances":[174],"interpretation":[177],"transformers":[179,192],"sheds":[181],"light":[182],"on":[183],"some":[184],"overlooked":[185],"abilities":[186],"modeling":[195],"MIR.":[197]},"counts_by_year":[],"updated_date":"2025-11-06T04:12:42.849631","created_date":"2025-10-10T00:00:00"}
