{"id":"https://openalex.org/W6968480076","doi":"https://doi.org/10.5281/zenodo.14877280","title":"X-Cover: Better Music Version Identification System by Integrating Pretrained ASR Model","display_name":"X-Cover: Better Music Version Identification System by Integrating Pretrained ASR Model","publication_year":2024,"publication_date":"2024-11-10","ids":{"openalex":"https://openalex.org/W6968480076","doi":"https://doi.org/10.5281/zenodo.14877280"},"language":"en","primary_location":{"id":"doi:10.5281/zenodo.14877280","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.14877280","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Xingjian Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xingjian Du","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Mingyu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mingyu Liu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Pei Zou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pei Zou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xia Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia Liang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zijie Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zijie Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Huidong Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huidong Liang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Bilei Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bilei Zhu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3559,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.63231168,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.0017999999690800905,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.0006000000284984708,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.5440999865531921},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.49790000915527344},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4580000042915344},{"id":"https://openalex.org/keywords/music-information-retrieval","display_name":"Music information retrieval","score":0.43209999799728394},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4277999997138977},{"id":"https://openalex.org/keywords/dimensionality-reduction","display_name":"Dimensionality reduction","score":0.3953999876976013},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3808000087738037},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.375900000333786}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7893000245094299},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6610999703407288},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5695000290870667},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.5440999865531921},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.49790000915527344},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4580000042915344},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.43209999799728394},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4277999997138977},{"id":"https://openalex.org/C70518039","wikidata":"https://www.wikidata.org/wiki/Q16000077","display_name":"Dimensionality reduction","level":2,"score":0.3953999876976013},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3808000087738037},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.375900000333786},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3578999936580658},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.33399999141693115},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3140999972820282},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C100675267","wikidata":"https://www.wikidata.org/wiki/Q1371624","display_name":"Background noise","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.2915000021457672},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2825999855995178},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.27900001406669617},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2780428219","wikidata":"https://www.wikidata.org/wiki/Q16952335","display_name":"Cover (algebra)","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.25999999046325684}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.14877280","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.14877280","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877280","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Methods":[0],"based":[1],"on":[2],"deep":[3],"learning":[4],"have":[5,25],"emerged":[6],"as":[7],"a":[8,98,124,153,168,176],"dominant":[9],"approach":[10,94],"for":[11],"cover":[12],"song":[13],"identification":[14],"(CSI)":[15],"literature":[16],"over":[17],"the":[18,35,53,87,118,144,148,159,184],"past":[19],"years,":[20],"among":[21],"which":[22,156],"ByteCover":[23],"systems":[24],"consistently":[26],"delivered":[27],"state-of-the-art":[28],"performance":[29,89,195],"across":[30,196],"major":[31],"CSI":[32,91,120],"datasets":[33],"in":[34,152],"field.":[36],"Despite":[37],"its":[38],"steady":[39],"improvements":[40],"along":[41],"previous":[42,149],"generations":[43],"from":[44,71,109,172],"audio":[45],"feature":[46],"dimensionality":[47],"reduction":[48],"to":[49,57,60,105,132,166,183,192],"short":[50],"query":[51],"identification,":[52],"system":[54,121,151],"is":[55,180],"found":[56],"be":[58],"vulnerable":[59],"audios":[61],"with":[62],"noise":[63],"and":[64,134,147],"ambiguous":[65],"melody":[66],"when":[67],"extracting":[68],"musical":[69],"information":[70,164],"constant-Q":[72],"transformation":[73],"(CQT)":[74],"spectrograms.":[75],"Although":[76],"some":[77],"recent":[78],"studies":[79],"suggest":[80],"that":[81,122],"incorporating":[82],"lyric-related":[83,107],"features":[84,108,137],"can":[85],"enhance":[86],"overall":[88],"of":[90,161,178,186],"systems,":[92],"this":[93,113,187],"typically":[95],"requires":[96],"training":[97,167,185],"separate":[99],"automatic":[100,126],"lyric":[101,163],"recognition":[102,128],"(ALR)":[103],"model":[104,171],"extract":[106,133],"music":[110],"recordings.":[111],"In":[112,174],"work,":[114],"we":[115,141],"introduce":[116],"X-Cover,":[117],"latest":[119],"incorporates":[123],"pre-trained":[125],"speech":[127],"(ASR)":[129],"module,":[130],"Whisper,":[131],"integrate":[135],"lyrics-related":[136],"into":[138],"modelling.":[139],"Specifically,":[140],"jointly":[142],"fine-tune":[143],"ASR":[145],"block":[146],"ByteCover3":[150],"parameter-efficient":[154],"fashion,":[155],"largely":[157],"reduces":[158],"cost":[160],"using":[162],"compared":[165],"new":[169,188],"ALR":[170],"scratch.":[173],"addition,":[175],"bag":[177],"tricks":[179],"further":[181],"applied":[182],"generation,":[189],"assisting":[190],"X-Cover":[191],"achieve":[193],"strong":[194],"various":[197],"datasets.":[198]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-24T08:02:53.985720","created_date":"2025-10-10T00:00:00"}
