{"id":"https://openalex.org/W4221165769","doi":"https://doi.org/10.1109/icassp43922.2022.9747304","title":"Tonet: Tone-Octave Network for Singing Melody Extraction from Polyphonic Music","display_name":"Tonet: Tone-Octave Network for Singing Melody Extraction from Polyphonic Music","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4221165769","doi":"https://doi.org/10.1109/icassp43922.2022.9747304"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747304","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747304","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100451980","display_name":"Ke Chen","orcid":"https://orcid.org/0000-0001-8357-3741"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California, San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ke Chen","raw_affiliation_strings":["University of California San Diego"],"affiliations":[{"raw_affiliation_string":"University of California San Diego","institution_ids":["https://openalex.org/I36258959"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100548236","display_name":"Shuai Yu","orcid":"https://orcid.org/0009-0000-4195-5522"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Yu","raw_affiliation_strings":["Fudan University"],"affiliations":[{"raw_affiliation_string":"Fudan University","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066503076","display_name":"Cheng-i Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng-i Wang","raw_affiliation_strings":["Smule Inc"],"affiliations":[{"raw_affiliation_string":"Smule Inc","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100318193","display_name":"Wei Li","orcid":"https://orcid.org/0000-0002-4486-8341"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Li","raw_affiliation_strings":["Fudan University"],"affiliations":[{"raw_affiliation_string":"Fudan University","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017455302","display_name":"Taylor Berg-Kirkpatrick","orcid":"https://orcid.org/0000-0002-1283-4075"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California, San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Taylor Berg-Kirkpatrick","raw_affiliation_strings":["University of California San Diego"],"affiliations":[{"raw_affiliation_string":"University of California San Diego","institution_ids":["https://openalex.org/I36258959"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5042278809","display_name":"Shlomo Dubnov","orcid":"https://orcid.org/0000-0003-0222-1125"},"institutions":[{"id":"https://openalex.org/I36258959","display_name":"University of California, San Diego","ror":"https://ror.org/0168r3w48","country_code":"US","type":"education","lineage":["https://openalex.org/I36258959"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shlomo Dubnov","raw_affiliation_strings":["University of California San Diego"],"affiliations":[{"raw_affiliation_string":"University of California San Diego","institution_ids":["https://openalex.org/I36258959"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100451980"],"corresponding_institution_ids":["https://openalex.org/I36258959"],"apc_list":null,"apc_paid":null,"fwci":2.5774,"has_fulltext":false,"cited_by_count":22,"citation_normalized_percentile":{"value":0.9206951,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"621","last_page":"625"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/octave","display_name":"Octave (electronics)","score":0.7619495391845703},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7124492526054382},{"id":"https://openalex.org/keywords/tone","display_name":"Tone (literature)","score":0.6506953239440918},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6480126976966858},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.6283212304115295},{"id":"https://openalex.org/keywords/salience","display_name":"Salience (neuroscience)","score":0.55619215965271},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.5114983320236206},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.5081667900085449},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4942016899585724},{"id":"https://openalex.org/keywords/auditory-scene-analysis","display_name":"Auditory scene analysis","score":0.49266940355300903},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4248621463775635},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.34256404638290405},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.2677702307701111},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.2524099349975586}],"concepts":[{"id":"https://openalex.org/C85841341","wikidata":"https://www.wikidata.org/wiki/Q1135984","display_name":"Octave (electronics)","level":2,"score":0.7619495391845703},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7124492526054382},{"id":"https://openalex.org/C2780583480","wikidata":"https://www.wikidata.org/wiki/Q1366327","display_name":"Tone (literature)","level":2,"score":0.6506953239440918},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6480126976966858},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.6283212304115295},{"id":"https://openalex.org/C108154423","wikidata":"https://www.wikidata.org/wiki/Q1469792","display_name":"Salience (neuroscience)","level":2,"score":0.55619215965271},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.5114983320236206},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.5081667900085449},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4942016899585724},{"id":"https://openalex.org/C38129911","wikidata":"https://www.wikidata.org/wiki/Q4820038","display_name":"Auditory scene analysis","level":3,"score":0.49266940355300903},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4248621463775635},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.34256404638290405},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.2677702307701111},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2524099349975586},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747304","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747304","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.550000011920929,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1978423151","https://openalex.org/W2059020775","https://openalex.org/W2134025364","https://openalex.org/W2161632835","https://openalex.org/W2194775991","https://openalex.org/W2296724634","https://openalex.org/W2407685581","https://openalex.org/W2410438028","https://openalex.org/W2571951934","https://openalex.org/W2773294482","https://openalex.org/W2902480954","https://openalex.org/W2932319787","https://openalex.org/W2938109705","https://openalex.org/W2963535133","https://openalex.org/W2964177567","https://openalex.org/W3005441616","https://openalex.org/W3015608758","https://openalex.org/W3047453285","https://openalex.org/W3101039626","https://openalex.org/W3158472478","https://openalex.org/W3163379884","https://openalex.org/W3191088441","https://openalex.org/W4224920338","https://openalex.org/W4224941193","https://openalex.org/W4226151502","https://openalex.org/W4385245566","https://openalex.org/W6631190155","https://openalex.org/W6697040288","https://openalex.org/W6714030504","https://openalex.org/W6731524272","https://openalex.org/W6739901393","https://openalex.org/W6746836464","https://openalex.org/W6756648248","https://openalex.org/W6781553146","https://openalex.org/W6794325019"],"related_works":["https://openalex.org/W2530685530","https://openalex.org/W4375868962","https://openalex.org/W2088854863","https://openalex.org/W2011227383","https://openalex.org/W1976719989","https://openalex.org/W2942893872","https://openalex.org/W2065606036","https://openalex.org/W3179495260","https://openalex.org/W1547433061","https://openalex.org/W2074549717"],"abstract_inverted_index":{"Singing":[0],"melody":[1,39],"extraction":[2,162],"is":[3,103],"an":[4,81,99,116],"important":[5],"problem":[6],"in":[7,35,171],"the":[8,23,36,85,129,139,159],"field":[9],"of":[10,38,94,141],"music":[11],"information":[12,40],"retrieval.":[13],"Existing":[14],"methods":[15],"typically":[16],"rely":[17],"on":[18],"frequency-domain":[19],"representations":[20],"to":[21,32,105,127,137],"estimate":[22],"sung":[24],"frequencies.":[25],"However,":[26],"this":[27,48],"design":[28],"does":[29],"not":[30],"lead":[31],"human-level":[33],"performance":[34,163],"perception":[37],"for":[41],"both":[42,62],"tone":[43,63,112,174],"(pitch-class)":[44],"and":[45,64,73,115,173],"octave.":[46],"In":[47],"paper,":[49],"we":[50,79,97,121],"propose":[51,122],"TONet":[52,142],"<sup":[53],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[54],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[55],",":[56],"a":[57,69,74,92,107,111,123],"plug-and-play":[58],"model":[59],"that":[60,87,102,151],"improves":[61],"octave":[65,117,172],"perceptions":[66],"by":[67],"leveraging":[68],"novel":[70,75],"input":[71,83],"representation":[72],"network":[76],"architecture.":[77],"First,":[78],"present":[80],"improved":[82],"representation,":[84],"Tone-CFP,":[86],"explicitly":[88],"groups":[89],"harmonics":[90],"via":[91],"rearrangement":[93],"frequency-bins.":[95],"Second,":[96],"introduce":[98],"encoder-decoder":[100],"architecture":[101],"designed":[104],"obtain":[106],"salience":[108,131],"feature":[109,113,118,132],"map,":[110,114],"map.":[119,133],"Third,":[120],"tone-octave":[124,152],"fusion":[125,153],"mechanism":[126],"improve":[128,158],"final":[130],"Experiments":[134],"are":[135],"done":[136],"verify":[138],"capability":[140],"with":[143,154,168],"various":[144,165],"baseline":[145],"backbone":[146],"models.":[147],"Our":[148],"results":[149],"show":[150],"Tone-CFP":[155],"can":[156],"significantly":[157],"singing":[160],"voice":[161],"across":[164],"datasets":[166],"\u2013":[167],"substantial":[169],"gains":[170],"accuracy.":[175]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
