{"id":"https://openalex.org/W3190413508","doi":"https://doi.org/10.1109/icassp39728.2021.9414431","title":"An Hrnet-Blstm Model With Two-Stage Training For Singing Melody Extraction","display_name":"An Hrnet-Blstm Model With Two-Stage Training For Singing Melody Extraction","publication_year":2021,"publication_date":"2021-05-13","ids":{"openalex":"https://openalex.org/W3190413508","doi":"https://doi.org/10.1109/icassp39728.2021.9414431","mag":"3190413508"},"language":"en","primary_location":{"id":"doi:10.1109/icassp39728.2021.9414431","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414431","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5051966001","display_name":"Yongwei Gao","orcid":"https://orcid.org/0000-0001-7026-9273"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yongwei Gao","raw_affiliation_strings":["School of Computer Science and Technology, Fudan University, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Fudan University, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091528537","display_name":"Xingjian Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xingjian Du","raw_affiliation_strings":["ByteDance AI Lab, China"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100934338","display_name":"Bilei Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bilei Zhu","raw_affiliation_strings":["ByteDance AI Lab, China"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064077063","display_name":"Xiaoheng Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoheng Sun","raw_affiliation_strings":["School of Computer Science and Technology, Fudan University, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Fudan University, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102918058","display_name":"Wei Li","orcid":"https://orcid.org/0000-0002-6883-8761"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Li","raw_affiliation_strings":["School of Computer Science and Technology, Fudan University, China","Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Fudan University, China","institution_ids":["https://openalex.org/I24943067"]},{"raw_affiliation_string":"Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5110851569","display_name":"Zejun Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zejun Ma","raw_affiliation_strings":["ByteDance AI Lab, China"],"affiliations":[{"raw_affiliation_string":"ByteDance AI Lab, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5051966001"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":0.1524,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.43427992,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"56","last_page":"60"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9937999844551086,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8254998922348022},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5871585011482239},{"id":"https://openalex.org/keywords/semitone","display_name":"Semitone","score":0.5669326186180115},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.557068407535553},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5192692279815674},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5176877975463867},{"id":"https://openalex.org/keywords/pitch-detection-algorithm","display_name":"Pitch detection algorithm","score":0.47045671939849854},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4659591317176819},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.4653379023075104},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.44902950525283813},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.4453471899032593},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4407433271408081},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.43455129861831665},{"id":"https://openalex.org/keywords/voice","display_name":"Voice","score":0.4318872094154358},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.43116217851638794},{"id":"https://openalex.org/keywords/layer","display_name":"Layer (electronics)","score":0.4298456013202667},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.22432982921600342}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8254998922348022},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5871585011482239},{"id":"https://openalex.org/C79035858","wikidata":"https://www.wikidata.org/wiki/Q738785","display_name":"Semitone","level":2,"score":0.5669326186180115},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.557068407535553},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5192692279815674},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5176877975463867},{"id":"https://openalex.org/C135622632","wikidata":"https://www.wikidata.org/wiki/Q7198851","display_name":"Pitch detection algorithm","level":3,"score":0.47045671939849854},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4659591317176819},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.4653379023075104},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.44902950525283813},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.4453471899032593},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4407433271408081},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.43455129861831665},{"id":"https://openalex.org/C552089266","wikidata":"https://www.wikidata.org/wiki/Q494510","display_name":"Voice","level":2,"score":0.4318872094154358},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.43116217851638794},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.4298456013202667},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.22432982921600342},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp39728.2021.9414431","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp39728.2021.9414431","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5899999737739563}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1976069042","https://openalex.org/W2161632835","https://openalex.org/W2194775991","https://openalex.org/W2296724634","https://openalex.org/W2407685581","https://openalex.org/W2571951934","https://openalex.org/W2773294482","https://openalex.org/W2902480954","https://openalex.org/W2916798096","https://openalex.org/W2932319787","https://openalex.org/W2937087044","https://openalex.org/W2963535133","https://openalex.org/W2964121744","https://openalex.org/W2964177567","https://openalex.org/W3014641072","https://openalex.org/W3049401840","https://openalex.org/W3150201805","https://openalex.org/W6697040288","https://openalex.org/W6714030504","https://openalex.org/W6731524272","https://openalex.org/W6746836464","https://openalex.org/W6756648248","https://openalex.org/W6782211628"],"related_works":["https://openalex.org/W116452896","https://openalex.org/W2117854015","https://openalex.org/W2044564457","https://openalex.org/W1548784694","https://openalex.org/W2904981233","https://openalex.org/W2111139066","https://openalex.org/W2387387595","https://openalex.org/W284503395","https://openalex.org/W2085186359","https://openalex.org/W41906912"],"abstract_inverted_index":{"Well-labeled":[0],"datasets":[1,160],"available":[2],"for":[3,52,61,77,89,95],"melody":[4,39,96],"extraction":[5],"are":[6],"scarce,":[7],"which":[8],"limits":[9],"the":[10,32,56,62,104,121,133,151,163],"further":[11],"advancement":[12],"of":[13,47,65,72,81,124],"deep":[14],"learning":[15],"based":[16],"methods.":[17],"To":[18],"overcome":[19],"this":[20],"problem,":[21],"we":[22],"propose":[23],"to":[24,30,42,119,138,143,149],"use":[25],"a":[26,44,69,78,110,128],"pitch":[27,34,58,144],"refinement":[28],"method":[29],"refine":[31],"semitone-level":[33],"sequences":[35],"decoded":[36],"from":[37],"massive":[38],"MIDI":[40],"files":[41],"generate":[43],"large":[45],"number":[46],"fundamental":[48],"frequency":[49],"(F0)":[50],"values":[51,59],"model":[53],"training.":[54,82],"Since":[55],"refined":[57],"used":[60,76,118],"first":[63],"round":[64,80],"training":[66],"contain":[67],"errors,":[68],"small":[70],"set":[71],"well-labeled":[73],"data":[74],"is":[75,93,117,146],"second":[79],"A":[83],"high-resolution":[84],"network":[85],"(HRNet),":[86],"initially":[87],"developed":[88],"human":[90],"pose":[91],"estimation,":[92],"introduced":[94],"extraction.":[97],"It":[98],"considers":[99],"multi-resolution":[100],"feature":[101],"learning,":[102],"making":[103],"resulting":[105],"representation":[106],"semantically":[107],"richer.":[108],"Subsequently,":[109],"bidirectional":[111],"long":[112],"short-term":[113],"memory":[114],"(BLSTM)":[115],"layer":[116],"exploit":[120],"temporal":[122],"information":[123],"melody.":[125],"In":[126],"addition,":[127],"new":[129],"loss":[130],"function":[131],"where":[132],"unvoiced":[134],"frames":[135],"only":[136],"contribute":[137],"voicing":[139],"detection":[140],"but":[141],"not":[142],"classification":[145],"also":[147],"proposed":[148,164],"alleviate":[150],"class":[152],"imbalance":[153],"problem.":[154],"Experiment":[155],"results":[156],"on":[157],"three":[158],"public":[159],"show":[161],"that":[162],"system":[165],"outperforms":[166],"four":[167],"state-of-the-art":[168],"algorithms":[169],"in":[170],"most":[171],"cases.":[172]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
