{"id":"https://openalex.org/W7138037538","doi":"https://doi.org/10.1609/aaai.v40i28.39483","title":"MIDILM: A Dual-Path Model for Controllable Text-to-MIDI Generation","display_name":"MIDILM: A Dual-Path Model for Controllable Text-to-MIDI Generation","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138037538","doi":"https://doi.org/10.1609/aaai.v40i28.39483"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i28.39483","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i28.39483","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i28.39483","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100733008","display_name":"Shuyu Li","orcid":"https://orcid.org/0009-0003-8426-1215"},"institutions":[{"id":"https://openalex.org/I205490536","display_name":"Dongguk University","ror":"https://ror.org/057q6n778","country_code":"KR","type":"education","lineage":["https://openalex.org/I205490536"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Shuyu Li","raw_affiliation_strings":["Dongguk University"],"affiliations":[{"raw_affiliation_string":"Dongguk University","institution_ids":["https://openalex.org/I205490536"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129682233","display_name":"Dooho Choi","orcid":null},"institutions":[{"id":"https://openalex.org/I205490536","display_name":"Dongguk University","ror":"https://ror.org/057q6n778","country_code":"KR","type":"education","lineage":["https://openalex.org/I205490536"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Dooho Choi","raw_affiliation_strings":["Dongguk University"],"affiliations":[{"raw_affiliation_string":"Dongguk University","institution_ids":["https://openalex.org/I205490536"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054124766","display_name":"Yunsick Sung","orcid":null},"institutions":[{"id":"https://openalex.org/I205490536","display_name":"Dongguk University","ror":"https://ror.org/057q6n778","country_code":"KR","type":"education","lineage":["https://openalex.org/I205490536"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Yunsick Sung","raw_affiliation_strings":["Dongguk University"],"affiliations":[{"raw_affiliation_string":"Dongguk University","institution_ids":["https://openalex.org/I205490536"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100733008"],"corresponding_institution_ids":["https://openalex.org/I205490536"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32985075,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"28","first_page":"23160","last_page":"23168"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.5461999773979187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.5461999773979187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.06920000165700912,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.0560000017285347,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5843999981880188},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.567300021648407},{"id":"https://openalex.org/keywords/controllability","display_name":"Controllability","score":0.5315999984741211},{"id":"https://openalex.org/keywords/feed-forward","display_name":"Feed forward","score":0.45179998874664307},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4092000126838684},{"id":"https://openalex.org/keywords/music-information-retrieval","display_name":"Music information retrieval","score":0.3723999857902527},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.36559998989105225},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.3598000109195709}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7304999828338623},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5843999981880188},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.567300021648407},{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.5315999984741211},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45399999618530273},{"id":"https://openalex.org/C38858127","wikidata":"https://www.wikidata.org/wiki/Q5441228","display_name":"Feed forward","level":2,"score":0.45179998874664307},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4092000126838684},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3937000036239624},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.37619999051094055},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.3723999857902527},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.36559998989105225},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.3598000109195709},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.3215999901294708},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.29840001463890076},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2985684807","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Text generation","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C44819458","wikidata":"https://www.wikidata.org/wiki/Q27939","display_name":"Singing","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i28.39483","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i28.39483","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i28.39483","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i28.39483","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Text-to-MIDI":[0],"generation":[1],"offers":[2],"editable":[3],"and":[4,23,60,65,85,122,135,155],"hierarchical":[5],"control":[6],"over":[7],"symbolic":[8],"music":[9,25,43,159],"generation.":[10,160],"Previous":[11],"approaches":[12],"either":[13],"convert":[14],"text":[15,40,76],"into":[16,153],"a":[17,55,79,93,146],"limited":[18],"set":[19],"of":[20,49,57],"musical":[21,86],"attributes":[22],"generate":[24],"based":[26],"on":[27,113,117],"these":[28,72],"attributes,":[29],"which":[30,70],"limits":[31],"semantic":[32,120,133],"controllability,":[33],"or":[34],"use":[35],"end-to-end":[36],"models":[37],"that":[38,82,141],"map":[39],"directly":[41],"to":[42,115,130],"without":[44],"deeply":[45],"aligning":[46],"the":[47,99,104],"features":[48],"both":[50,132],"modalities,":[51],"often":[52],"resulting":[53],"in":[54,62],"lack":[56],"structural":[58,123,136],"coherence":[59],"mismatches":[61],"key,":[63],"meter,":[64],"tempo.":[66],"We":[67],"propose":[68],"MIDILM,":[69],"addresses":[71],"limitations":[73],"by":[74],"employing":[75],"conditioning":[77],"with":[78,107],"dual-path":[80],"decoder":[81],"processes":[83],"textual":[84],"information":[87],"through":[88],"separate":[89],"feedforward":[90],"paths":[91],"following":[92],"shared":[94],"masked":[95],"self-attention":[96],"mechanism.":[97],"On":[98],"MidiCaps":[100],"benchmark,":[101],"MIDILM":[102,142],"outperformed":[103],"strongest":[105],"baseline,":[106],"relative":[108],"improvements":[109],"ranging":[110],"from":[111],"6.07%":[112],"CLAP":[114],"144.77%":[116],"TB":[118],"across":[119],"alignment":[121],"metrics.":[124],"These":[125],"gains":[126],"confirm":[127],"its":[128],"ability":[129],"enhance":[131],"controllability":[134],"coherence.":[137],"Collectively,":[138],"we":[139],"expect":[140],"will":[143],"serve":[144],"as":[145],"useful":[147],"reference":[148],"framework":[149],"for":[150],"future":[151],"investigations":[152],"controllable":[154],"structurally":[156],"faithful":[157],"cross-modal":[158]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
