{"id":"https://openalex.org/W7131683335","doi":"https://doi.org/10.1109/lsp.2026.3668742","title":"CSTNet: A Cross-Scale Transformer Network for Singing Melody Extraction","display_name":"CSTNet: A Cross-Scale Transformer Network for Singing Melody Extraction","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7131683335","doi":"https://doi.org/10.1109/lsp.2026.3668742"},"language":null,"primary_location":{"id":"doi:10.1109/lsp.2026.3668742","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2026.3668742","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100333011","display_name":"Jie Chen","orcid":"https://orcid.org/0000-0003-3435-0351"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jianbin Chen","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102873151","display_name":"Hao Xu","orcid":"https://orcid.org/0000-0003-0885-7343"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Xu","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126888894","display_name":"Zhengwei Peng","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhengwei Peng","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-Sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100428136","display_name":"Yanfang Li","orcid":"https://orcid.org/0000-0002-1487-365X"},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yue Li","raw_affiliation_strings":["School of Computer Science and Engineering, South China University of Technology, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, South China University of Technology, Guangzhou, China","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100333011"],"corresponding_institution_ids":["https://openalex.org/I90610280"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64540541,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"33","issue":null,"first_page":"1240","last_page":"1244"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8971999883651733,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8971999883651733,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.020400000736117363,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.017500000074505806,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.6643999814987183},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6391000151634216},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5317999720573425},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.49810001254081726},{"id":"https://openalex.org/keywords/singing","display_name":"Singing","score":0.49790000915527344},{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.47600001096725464},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4636000096797943},{"id":"https://openalex.org/keywords/octave","display_name":"Octave (electronics)","score":0.45509999990463257}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7906000018119812},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.6643999814987183},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6391000151634216},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5533999800682068},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5317999720573425},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5232999920845032},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.49810001254081726},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.49790000915527344},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.47600001096725464},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4636000096797943},{"id":"https://openalex.org/C85841341","wikidata":"https://www.wikidata.org/wiki/Q1135984","display_name":"Octave (electronics)","level":2,"score":0.45509999990463257},{"id":"https://openalex.org/C128979739","wikidata":"https://www.wikidata.org/wiki/Q179465","display_name":"Polyphony","level":2,"score":0.4456999897956848},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3653999865055084},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.33880001306533813},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.3102000057697296},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C9940772","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Psychoacoustics","level":3,"score":0.2743000090122223},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.27309998869895935},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.26600000262260437}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2026.3668742","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2026.3668742","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Singing":[0],"melody":[1],"extraction":[2,24],"from":[3],"polyphonic":[4],"music":[5,13],"is":[6],"a":[7,51,99],"complex":[8],"but":[9],"important":[10],"task":[11],"in":[12,31],"information":[14],"retrieval.":[15],"Most":[16],"existing":[17],"models":[18],"struggle":[19],"with":[20],"adaptive":[21,80],"multi-scale":[22,61],"feature":[23,82],"and":[25,38,59,63,86,123,131],"long-range":[26],"dependency":[27],"modeling,":[28],"often":[29],"resulting":[30],"common":[32],"artifacts":[33],"such":[34],"as":[35],"octave":[36,143],"errors":[37],"pitch":[39],"fragmentation.":[40],"To":[41],"address":[42],"this,":[43],"we":[44],"propose":[45],"the":[46,72,126],"Cross-Scale":[47],"Transformer":[48,89],"Network":[49],"(CSTNet),":[50],"novel":[52],"deep":[53],"learning":[54],"architecture":[55],"that":[56,107,139],"dynamically":[57],"captures":[58],"fuses":[60],"temporal":[62],"spectral":[64],"features.":[65],"The":[66],"CSTNet":[67,140],"introduces":[68],"two":[69],"key":[70],"components:":[71],"Multi":[73],"scale":[74,96],"Time-Frequency":[75],"Aggregation":[76],"(MF-TFA)":[77],"module":[78],"for":[79],"local":[81],"representation":[83],"across":[84],"resolutions,":[85],"Feature":[87],"Cross":[88],"Block":[90],"(FCTB),":[91],"which":[92],"enables":[93],"explicit":[94],"cross":[95],"interaction":[97],"through":[98],"global":[100],"channel-wise":[101],"cross-attention":[102],"mechanism.":[103],"Experimental":[104],"results":[105,137],"show":[106],"our":[108],"proposed":[109],"method":[110],"outperforms":[111],"five":[112],"compared":[113],"state-of-the-art":[114],"methods,":[115],"achieving":[116],"overall":[117],"accuracy":[118],"scores":[119],"of":[120],"87.3%,":[121],"90.5%,":[122],"77.4%":[124],"on":[125],"ADC":[127],"2004,":[128],"MIREX":[129],"05,":[130],"MEDLEY":[132],"DB":[133],"datasets,":[134],"respectively.":[135],"Visualization":[136],"demonstrate":[138],"effectively":[141],"reduces":[142],"errors.":[144]},"counts_by_year":[],"updated_date":"2026-03-21T06:30:42.041108","created_date":"2026-02-27T00:00:00"}
