{"id":"https://openalex.org/W7108725572","doi":"https://doi.org/10.5281/zenodo.17811470","title":"Assessing the Alignment of Audio Representations With Timbre Similarity Ratings","display_name":"Assessing the Alignment of Audio Representations With Timbre Similarity Ratings","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108725572","doi":"https://doi.org/10.5281/zenodo.17811470"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811470","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811470","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811470","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Haokun Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haokun Tian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Stefan Lattner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stefan Lattner","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Charalampos Saitis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Charalampos Saitis","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.54923458,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9007999897003174,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9007999897003174,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.03449999913573265,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.014299999922513962,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.7626000046730042},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.6575000286102295},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.6037999987602234},{"id":"https://openalex.org/keywords/mel-frequency-cepstrum","display_name":"Mel-frequency cepstrum","score":0.5511000156402588},{"id":"https://openalex.org/keywords/multidimensional-scaling","display_name":"Multidimensional scaling","score":0.5397999882698059},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5131999850273132},{"id":"https://openalex.org/keywords/sound-quality","display_name":"Sound quality","score":0.4350000023841858},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.41690000891685486},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.41290000081062317},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.4106000065803528}],"concepts":[{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.7626000046730042},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7279000282287598},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.6575000286102295},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.6037999987602234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6029999852180481},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5557000041007996},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.5511000156402588},{"id":"https://openalex.org/C91682802","wikidata":"https://www.wikidata.org/wiki/Q620538","display_name":"Multidimensional scaling","level":2,"score":0.5397999882698059},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5131999850273132},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.4350000023841858},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.41690000891685486},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.4106000065803528},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4011000096797943},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3889999985694885},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.36730000376701355},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3483000099658966},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3352000117301941},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3294999897480011},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.3255999982357025},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.3124000132083893},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30889999866485596},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.29660001397132874},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.27000001072883606},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2612999975681305},{"id":"https://openalex.org/C143271835","wikidata":"https://www.wikidata.org/wiki/Q254515","display_name":"Similitude","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C9940772","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Psychoacoustics","level":3,"score":0.2535000145435333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811470","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811470","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811470","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811470","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Psychoacoustical":[0],"so-called":[1],"\"timbre":[2],"spaces\"":[3],"map":[4],"perceptual":[5],"similarity":[6,40,114],"ratings":[7,80],"of":[8,25,105,112,124],"instrument":[9],"sounds":[10],"onto":[11],"low-dimensional":[12],"embeddings":[13,48,152,176],"via":[14],"multidimensional":[15],"scaling":[16],"but":[17],"suffer":[18],"from":[19,29,59,177],"scalability":[20],"issues":[21],"and":[22,32,88,121,141,165,173],"are":[23,163],"incapable":[24],"generalization.":[26],"Recent":[27],"results":[28],"audio":[30,83,93,107],"(music":[31],"speech)":[33],"quality":[34],"assessment":[35],"as":[36,38],"well":[37,51],"image":[39],"have":[41],"shown":[42],"that":[43,49,170],"deep":[44,74,194],"learning":[45],"provides":[46],"emergent":[47],"align":[50],"with":[52,109],"human":[53,110,128],"perception":[54],"while":[55],"being":[56],"largely":[57],"free":[58],"these":[60],"constraints.":[61],"Although":[62],"the":[63,103,118,122,155,160,174],"existing":[64,92],"'timbre":[65],"space'":[66],"data":[67],"is":[68,86],"not":[69],"large":[70],"enough":[71],"to":[72,101,127,197],"train":[73],"neural":[75],"networks":[76],"(only":[77],"2,614":[78],"pairwise":[79],"on":[81],"334":[82],"samples),":[84],"it":[85],"sufficient":[87],"suitable":[89],"for":[90],"evaluating":[91],"models.":[94],"In":[95],"this":[96],"paper,":[97],"we":[98],"introduce":[99],"metrics":[100],"assess":[102],"alignment":[104],"diverse":[106],"representations":[108,149],"judgements":[111],"timbre":[113,199],"by":[115,154],"comparing":[116],"both":[117],"absolute":[119],"values":[120],"rankings":[123],"embedding":[125],"distances":[126],"dissimilarity":[129],"ratings.":[130],"Our":[131,167],"evaluation":[132],"involves":[133],"3":[134],"signal-processing":[135],"based":[136],"methods,":[137],"10":[138],"pretrained":[139],"models,":[140],"a":[142],"novel":[143],"sound":[144,179],"matching":[145,180],"model":[146,181],"where":[147],"three":[148],"(including":[150],"'style'":[151],"inspired":[153],"style":[156,175],"transfer":[157],"task":[158],"in":[159,192],"vision":[161],"domain)":[162],"extracted":[164],"evaluated.":[166],"analysis":[168],"reveals":[169],"CLAP-based":[171],"models":[172],"our":[178],"achieve":[182],"marginal":[183],"gains":[184],"over":[185],"alternatives,":[186],"yet":[187],"MFCC":[188],"remains":[189],"competitive\u2014underscoring":[190],"gaps":[191],"current":[193],"features'":[195],"ability":[196],"encode":[198],"similarity.":[200]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-05T00:00:00"}
