{"id":"https://openalex.org/W3196747467","doi":"https://doi.org/10.1145/3460426.3463619","title":"MS-SincResNet: Joint Learning of 1D and 2D Kernels Using Multi-scale SincNet and ResNet for Music Genre Classification","display_name":"MS-SincResNet: Joint Learning of 1D and 2D Kernels Using Multi-scale SincNet and ResNet for Music Genre Classification","publication_year":2021,"publication_date":"2021-08-24","ids":{"openalex":"https://openalex.org/W3196747467","doi":"https://doi.org/10.1145/3460426.3463619","mag":"3196747467"},"language":"en","primary_location":{"id":"doi:10.1145/3460426.3463619","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3460426.3463619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018841433","display_name":"Pei-Chun Chang","orcid":"https://orcid.org/0000-0002-1181-9340"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Pei-Chun Chang","raw_affiliation_strings":["National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"],"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100739157","display_name":"Yong\u2010Sheng Chen","orcid":"https://orcid.org/0000-0002-5581-850X"},"institutions":[{"id":"https://openalex.org/I148366613","display_name":"National Yang Ming Chiao Tung University","ror":"https://ror.org/00se2k293","country_code":"TW","type":"education","lineage":["https://openalex.org/I148366613"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yong-Sheng Chen","raw_affiliation_strings":["National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc"],"affiliations":[{"raw_affiliation_string":"National Yang Ming Chiao Tung University, Hsinchu, Taiwan Roc","institution_ids":["https://openalex.org/I148366613"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010660600","display_name":"Chang-Hsing Lee","orcid":"https://orcid.org/0000-0002-5761-421X"},"institutions":[{"id":"https://openalex.org/I59460038","display_name":"Chung Hua University","ror":"https://ror.org/01yzz0f51","country_code":"TW","type":"education","lineage":["https://openalex.org/I59460038"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chang-Hsing Lee","raw_affiliation_strings":["Chung Hua University, Hsinchu, Taiwan Roc"],"affiliations":[{"raw_affiliation_string":"Chung Hua University, Hsinchu, Taiwan Roc","institution_ids":["https://openalex.org/I59460038"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5018841433"],"corresponding_institution_ids":["https://openalex.org/I148366613"],"apc_list":null,"apc_paid":null,"fwci":2.6161,"has_fulltext":false,"cited_by_count":25,"citation_normalized_percentile":{"value":0.90564392,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"29","last_page":"36"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.991599977016449,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9912999868392944,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8921717405319214},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7777985334396362},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6888244152069092},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.6025186777114868},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.5800029039382935},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5750767588615417},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5710460543632507},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5096237659454346},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4717871844768524},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.43286603689193726}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8921717405319214},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7777985334396362},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6888244152069092},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.6025186777114868},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.5800029039382935},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5750767588615417},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5710460543632507},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5096237659454346},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4717871844768524},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.43286603689193726},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3460426.3463619","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3460426.3463619","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2021 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.6700000166893005}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W140785830","https://openalex.org/W1162301944","https://openalex.org/W1517388136","https://openalex.org/W1585977217","https://openalex.org/W1825426646","https://openalex.org/W1998176629","https://openalex.org/W2018083238","https://openalex.org/W2039739794","https://openalex.org/W2093231248","https://openalex.org/W2099760476","https://openalex.org/W2101252291","https://openalex.org/W2109255472","https://openalex.org/W2125324924","https://openalex.org/W2128196382","https://openalex.org/W2133824856","https://openalex.org/W2139435939","https://openalex.org/W2140539262","https://openalex.org/W2148748626","https://openalex.org/W2150279609","https://openalex.org/W2154221499","https://openalex.org/W2159561775","https://openalex.org/W2194775991","https://openalex.org/W2212050411","https://openalex.org/W2331517147","https://openalex.org/W2399733683","https://openalex.org/W2407513162","https://openalex.org/W2579586206","https://openalex.org/W2620629206","https://openalex.org/W2770454110","https://openalex.org/W2797211987","https://openalex.org/W2805966122","https://openalex.org/W2889391573","https://openalex.org/W2951019013","https://openalex.org/W2951111322","https://openalex.org/W2964052309","https://openalex.org/W2964218314","https://openalex.org/W3008707468","https://openalex.org/W3069400403","https://openalex.org/W3110962504","https://openalex.org/W3121574643","https://openalex.org/W3172007888"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2088854863","https://openalex.org/W2011227383","https://openalex.org/W1976719989","https://openalex.org/W2942893872","https://openalex.org/W2065606036","https://openalex.org/W3179495260","https://openalex.org/W3127543252","https://openalex.org/W2810679507"],"abstract_inverted_index":{"In":[0,164],"this":[1,61],"study,":[2],"we":[3,168],"proposed":[4,172],"a":[5,54],"new":[6],"end-to-end":[7],"convolutional":[8,29],"neural":[9],"network,":[10],"called":[11],"MS-SincResNet,":[12],"for":[13],"music":[14,49,63,71,146,162],"genre":[15],"classification.":[16],"MS-SincResNet":[17,173,187],"appends":[18],"1D":[19,37,76],"multi-scale":[20],"SincNet":[21,177],"(MS-SincNet)":[22],"to":[23,34,81,109,126,139,154],"2D":[24,40,84,115,185],"ResNet":[25,105],"as":[26],"the":[27,43,66,128,141,149,156,171,175,193,197,201],"first":[28],"layer":[30],"in":[31,60,131],"an":[32,47],"attempt":[33],"jointly":[35],"learn":[36],"kernels":[38,41],"and":[39,65,93,103,136,178,200],"during":[42],"training":[44],"stage.":[45],"First,":[46],"input":[48],"signal":[50],"is":[51,73,106,123,152,206],"divided":[52],"into":[53,75],"number":[55],"of":[56,69,133,144],"fixed-duration":[57],"(3":[58],"seconds":[59],"study)":[62],"clips,":[64],"raw":[67],"waveform":[68],"each":[70,145],"clip":[72],"fed":[74],"MS-SincNet":[77],"filter":[78],"learning":[79],"module":[80,122],"obtain":[82,140],"three-channel":[83],"representations.":[85,116],"The":[86,117,204],"learned":[87],"representations":[88],"carry":[89],"rich":[90],"timbral,":[91],"harmonic,":[92],"percussive":[94,101],"characteristics":[95],"comparing":[96],"with":[97,192],"spectrograms,":[98,100],"harmonic":[99],"spectrograms":[102],"Mel-spectrograms.":[104],"then":[107],"used":[108,125],"extract":[110],"discriminative":[111],"embeddings":[112],"from":[113,159],"these":[114],"spatial":[118],"pyramid":[119],"pooling":[120],"(SPP)":[121],"further":[124],"enhance":[127],"feature":[129],"discriminability,":[130],"terms":[132],"both":[134],"time":[135],"frequency":[137],"aspects,":[138],"classification":[142,157],"label":[143],"clip.":[147],"Finally,":[148],"voting":[150],"strategy":[151],"applied":[153],"summarize":[155],"results":[158,191],"all":[160],"3-second":[161],"clips.":[163],"our":[165],"experimental":[166],"results,":[167],"demonstrate":[169],"that":[170],"outperforms":[174],"baseline":[176],"many":[179],"well-known":[180],"hand-crafted":[181],"features.":[182],"Considering":[183],"individual":[184],"representation,":[186],"also":[188],"yields":[189],"competitive":[190],"state-of-the-art":[194],"methods":[195],"on":[196],"GTZAN":[198],"dataset":[199],"ISMIR2004":[202],"dataset.":[203],"code":[205],"available":[207],"at":[208],"https://github.com/PeiChunChang/MS-SincResNet.":[209]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":10},{"year":2022,"cited_by_count":2}],"updated_date":"2026-03-29T08:15:47.926485","created_date":"2025-10-10T00:00:00"}
