{"id":"https://openalex.org/W4408354601","doi":"https://doi.org/10.1109/icassp49660.2025.10890561","title":"MQAD: A Large-Scale Question Answering Dataset for Training Music Large Language Models","display_name":"MQAD: A Large-Scale Question Answering Dataset for Training Music Large Language Models","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408354601","doi":"https://doi.org/10.1109/icassp49660.2025.10890561"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890561","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890561","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046321739","display_name":"Zhihao Ouyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhihao Ouyang","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065288078","display_name":"Ju-Chiang Wang","orcid":"https://orcid.org/0009-0002-8265-4229"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ju-Chiang Wang","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051023034","display_name":"D. ZHANG","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daiyu Zhang","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Bin Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bin Chen","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101623196","display_name":"Shangjie Li","orcid":"https://orcid.org/0009-0006-0070-523X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shangjie Li","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101063208","display_name":"Quan Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Quan Lin","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5046321739"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.4851,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77990901,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9781000018119812,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13996","display_name":"Diverse Musicological Studies","score":0.9739999771118164,"subfield":{"id":"https://openalex.org/subfields/1210","display_name":"Music"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7711201906204224},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.652628481388092},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5887448191642761},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5708482265472412},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5218290686607361},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49881958961486816},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4886496365070343},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.45821091532707214},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.05326524376869202}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7711201906204224},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.652628481388092},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5887448191642761},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5708482265472412},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5218290686607361},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49881958961486816},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4886496365070343},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.45821091532707214},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.05326524376869202},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890561","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890561","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6200000047683716,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W2592168896","https://openalex.org/W2945761034","https://openalex.org/W4224918587","https://openalex.org/W4224944643","https://openalex.org/W4312380001","https://openalex.org/W4372260308","https://openalex.org/W4372260310","https://openalex.org/W4389344855","https://openalex.org/W4389518939","https://openalex.org/W4389519254","https://openalex.org/W4389523849","https://openalex.org/W4392909390","https://openalex.org/W4400033239","https://openalex.org/W6633499030","https://openalex.org/W6770607572","https://openalex.org/W6796581206","https://openalex.org/W6803003856","https://openalex.org/W6838461927","https://openalex.org/W6846556436","https://openalex.org/W6846848840","https://openalex.org/W6847363464","https://openalex.org/W6849105126","https://openalex.org/W6850936240","https://openalex.org/W6851592950","https://openalex.org/W6851762504","https://openalex.org/W6851950068","https://openalex.org/W6854866820","https://openalex.org/W6855932524","https://openalex.org/W6857054612","https://openalex.org/W6857577164","https://openalex.org/W6857614378","https://openalex.org/W6859099255","https://openalex.org/W6859201984","https://openalex.org/W6860041859"],"related_works":["https://openalex.org/W2384605597","https://openalex.org/W4288267738","https://openalex.org/W2964413124","https://openalex.org/W4388937922","https://openalex.org/W4394050964","https://openalex.org/W3113264705","https://openalex.org/W2551249631","https://openalex.org/W4287644835","https://openalex.org/W3092281475","https://openalex.org/W3098003361"],"abstract_inverted_index":{"Question-answering":[0],"(QA)":[1],"is":[2,27],"a":[3,10,19,47,59,109,142],"natural":[4,135],"approach":[5],"for":[6,16],"humans":[7],"to":[8,32,123,133,157],"understand":[9],"piece":[11],"of":[12,25,35,40,62,106,161],"music":[13,26,38,48,107,174],"audio.":[14],"However,":[15],"machines,":[17],"accessing":[18],"large-scale":[20],"dataset":[21,50,179],"covering":[22],"diverse":[23,82],"aspects":[24],"crucial,":[28],"yet":[29],"challenging,":[30],"due":[31],"the":[33,53,103,147,159],"scarcity":[34],"publicly":[36],"available":[37],"data":[39],"this":[41],"type.":[42],"This":[43],"paper":[44],"introduces":[45],"MQAD,":[46,113],"QA":[49,137],"built":[51],"on":[52,168],"Million":[54],"Song":[55],"Dataset":[56],"(MSD),":[57],"encompassing":[58],"rich":[60],"array":[61],"musical":[63,93,126],"features":[64,127],"-":[65],"including":[66],"beat,":[67],"chord,":[68],"key,":[69],"structure,":[70],"instrument,":[71],"and":[72,84,98,128,149,180],"genre":[73],"\u2014":[74],"across":[75],"270,000":[76],"tracks,":[77],"featuring":[78],"nearly":[79],"3":[80],"million":[81],"questions":[83],"captions.":[85],"MQAD":[86,169],"distinguishes":[87],"itself":[88],"by":[89],"offering":[90],"detailed":[91],"time-varying":[92],"information":[94],"such":[95],"as":[96],"chords":[97],"sections,":[99],"enabling":[100],"exploration":[101],"into":[102],"inherent":[104],"structure":[105],"within":[108],"song.":[110],"To":[111],"compile":[112],"our":[114,165],"methodology":[115],"leverages":[116],"specialized":[117],"Music":[118],"Information":[119],"Retrieval":[120],"(MIR)":[121],"models":[122],"extract":[124],"higher-level":[125],"Large":[129],"Language":[130],"Models":[131],"(LLMs)":[132],"generate":[134],"language":[136],"pairs.":[138],"Then,":[139],"we":[140],"leverage":[141],"multimodal":[143],"LLM":[144],"that":[145],"integrates":[146],"LLaMA2":[148],"Whisper":[150],"architectures,":[151],"along":[152],"with":[153],"novel":[154],"subjective":[155],"metrics":[156],"assess":[158],"performance":[160],"MQAD.":[162],"In":[163],"experiments,":[164],"model":[166],"trained":[167],"demonstrates":[170],"advancements":[171],"over":[172],"conventional":[173],"audio":[175],"captioning":[176],"approaches.":[177],"The":[178],"codes":[181],"are":[182],"at":[183],"https://github.com/oyzh888/MQAD.":[184]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-30T08:08:38.191290","created_date":"2025-10-10T00:00:00"}
