{"id":"https://openalex.org/W7154316316","doi":"https://doi.org/10.48550/arxiv.2604.09803","title":"MAGE: Modality-Agnostic Music Generation and Editing","display_name":"MAGE: Modality-Agnostic Music Generation and Editing","publication_year":2026,"publication_date":"2026-04-10","ids":{"openalex":"https://openalex.org/W7154316316","doi":"https://doi.org/10.48550/arxiv.2604.09803"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.09803","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09803","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.09803","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101325356","display_name":"Muhammad Usama Saleem","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saleem, Muhammad Usama","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133596244","display_name":"Tejasvi Ravi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ravi, Tejasvi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133582197","display_name":"Tianyu Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Tianyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120743152","display_name":"Rajeev Nongpiur","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nongpiur, Rajeev","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080872663","display_name":"Ishan Chatterjee","orcid":"https://orcid.org/0000-0002-2123-6392"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chatterjee, Ishan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133597732","display_name":"Mayur Jagdishbhai Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Mayur Jagdishbhai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133564388","display_name":"Pu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Pu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.3425999879837036,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.3425999879837036,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.3343999981880188,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.07739999890327454,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.5720999836921692},{"id":"https://openalex.org/keywords/concatenation","display_name":"Concatenation (mathematics)","score":0.44679999351501465},{"id":"https://openalex.org/keywords/multimodal-interaction","display_name":"Multimodal interaction","score":0.4129999876022339},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.39739999175071716},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.373199999332428},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3440000116825104},{"id":"https://openalex.org/keywords/memorization","display_name":"Memorization","score":0.32330000400543213},{"id":"https://openalex.org/keywords/spurious-relationship","display_name":"Spurious relationship","score":0.32100000977516174},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3034000098705292}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7978000044822693},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.5720999836921692},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5013999938964844},{"id":"https://openalex.org/C87619178","wikidata":"https://www.wikidata.org/wiki/Q126002","display_name":"Concatenation (mathematics)","level":2,"score":0.44679999351501465},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4147999882698059},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.4129999876022339},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.39739999175071716},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3772999942302704},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.373199999332428},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3034000098705292},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3012999892234802},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.2962000072002411},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.28209999203681946},{"id":"https://openalex.org/C38956757","wikidata":"https://www.wikidata.org/wiki/Q716215","display_name":"Audio feedback","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C194147245","wikidata":"https://www.wikidata.org/wiki/Q1076368","display_name":"Chord (peer-to-peer)","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.2667999863624573},{"id":"https://openalex.org/C161615301","wikidata":"https://www.wikidata.org/wiki/Q309396","display_name":"Keystroke logging","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.2578999996185303}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.09803","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09803","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.09803","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09803","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6048723459243774,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0,99],"music":[1,24,81,204,222],"creation":[2],"requires":[3],"models":[4],"that":[5,78,104,142,171,199],"can":[6],"both":[7],"generate":[8],"audio":[9,135,154],"from":[10,146],"high-level":[11],"cues":[12,151],"and":[13,32,64,70,83,111,137,149,180,206,216],"edit":[14],"existing":[15],"mixtures":[16],"in":[17],"a":[18,29,33,75,87,97,101,138,167,214],"targeted":[19,207],"manner.":[20],"Yet":[21],"most":[22],"multimodal":[23,80],"systems":[25],"are":[26],"built":[27],"for":[28,109,133],"single":[30,88],"task":[31],"fixed":[34],"prompting":[35],"interface,":[36],"making":[37],"their":[38],"conditioning":[39],"brittle":[40],"when":[41],"guidance":[42],"is":[43],"ambiguous,":[44],"temporally":[45,129],"misaligned,":[46],"or":[47,53],"partially":[48],"missing.":[49],"Common":[50],"additive":[51],"fusion":[52],"feature":[54],"concatenation":[55],"further":[56],"weakens":[57],"cross-modal":[58],"grounding,":[59,121],"often":[60],"causing":[61],"prompt":[62],"drift":[63],"spurious":[65],"musical":[66],"content":[67],"during":[68],"generation":[69,82,205],"editing.":[71],"We":[72],"propose":[73],"MAGE,":[74],"modality-agnostic":[76],"framework":[77],"unifies":[79],"mixture-grounded":[84],"editing":[85,112],"within":[86],"continuous":[89],"latent":[90,107],"formulation.":[91],"At":[92],"its":[93],"core,":[94],"MAGE":[95,200],"uses":[96],"Controlled":[98],"FluxFormer,":[100],"flow-based":[102],"Transformer":[103],"learns":[105],"controllable":[106],"trajectories":[108],"synthesis":[110],"under":[113,186],"any":[114],"available":[115],"subset":[116],"of":[117],"conditions.":[118],"To":[119],"improve":[120],"we":[122,164],"introduce":[123],"Audio-Visual":[124],"Nexus":[125],"Alignment":[126],"to":[127,152,175,220],"select":[128],"consistent":[130],"visual":[131,148],"evidence":[132],"the":[134,153,173,195],"timeline,":[136],"cross-gated":[139],"modulation":[140],"mechanism":[141],"applies":[143],"multiplicative":[144],"control":[145],"aligned":[147],"textual":[150],"latents,":[155],"suppressing":[156],"unsupported":[157],"components":[158],"rather":[159],"than":[160],"injecting":[161],"them.":[162],"Finally,":[163],"train":[165],"with":[166],"dynamic":[168],"modality-masking":[169],"curriculum":[170],"exposes":[172],"model":[174],"text-only,":[176],"visual-only,":[177],"joint":[178],"multimodal,":[179],"mixture-guided":[181],"settings,":[182],"enabling":[183],"robust":[184],"inference":[185],"missing":[187],"modalities":[188],"without":[189],"training":[190],"separate":[191],"models.":[192],"Experiments":[193],"on":[194],"MUSIC":[196],"benchmark":[197],"show":[198],"supports":[201],"effective":[202],"multimodal-guided":[203],"editing,":[208],"achieving":[209],"competitive":[210],"quality":[211],"while":[212],"offering":[213],"lightweight":[215],"flexible":[217],"interface":[218],"tailored":[219],"practical":[221],"workflows.":[223]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
