{"id":"https://openalex.org/W7154255908","doi":"https://doi.org/10.48550/arxiv.2604.11052","title":"LaDA-Band: Language Diffusion Models for Vocal-to-Accompaniment Generation","display_name":"LaDA-Band: Language Diffusion Models for Vocal-to-Accompaniment Generation","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154255908","doi":"https://doi.org/10.48550/arxiv.2604.11052"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.11052","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11052","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.11052","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133558879","display_name":"Qi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023216625","display_name":"Zhexu Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Zhexu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133583412","display_name":"Meng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Meng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133586788","display_name":"Guoxin Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Guoxin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133610619","display_name":"Chaoxu Pang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pang, Chaoxu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133589331","display_name":"Weifeng Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Weifeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100583054","display_name":"Wenjiang Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wenjiang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5133558879"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.26739999651908875,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.26739999651908875,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.16930000483989716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.16859999299049377,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5271999835968018},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.5054000020027161},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.4724999964237213},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.4530999958515167},{"id":"https://openalex.org/keywords/orchestration","display_name":"Orchestration","score":0.4438000023365021},{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.39640000462532043},{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.3594000041484833},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.33180001378059387},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.31850001215934753}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7232000231742859},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5271999835968018},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.5054000020027161},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.4724999964237213},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.4530999958515167},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.4438000023365021},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4375},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.39640000462532043},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.33059999346733093},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.31850001215934753},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.3052999973297119},{"id":"https://openalex.org/C130727458","wikidata":"https://www.wikidata.org/wiki/Q1639109","display_name":"Coarticulation","level":3,"score":0.301800012588501},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2851000130176544},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2815000116825104},{"id":"https://openalex.org/C4069607","wikidata":"https://www.wikidata.org/wiki/Q868732","display_name":"Aliasing","level":3,"score":0.2741999924182892},{"id":"https://openalex.org/C5297727","wikidata":"https://www.wikidata.org/wiki/Q786970","display_name":"Autocorrelation","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.11052","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11052","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.11052","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.11052","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vocal-to-accompaniment":[0],"(V2A)":[1],"generation,":[2],"which":[3],"aims":[4],"to":[5,61,96,170,175],"transform":[6],"a":[7,12,38,110,151,166],"raw":[8],"vocal":[9,31],"recording":[10],"into":[11],"fully":[13],"arranged":[14],"accompaniment,":[15],"inherently":[16],"requires":[17],"jointly":[18],"addressing":[19],"an":[20,88,155],"accompaniment":[21,163],"trilemma:":[22],"preserving":[23,140],"acoustic":[24,64,142,192],"authenticity,":[25,193],"maintaining":[26,203],"global":[27,194],"coherence":[28],"with":[29,125],"the":[30,97,117],"track,":[32],"and":[33,79,136,165,184,196,212],"producing":[34],"dynamic":[35,197],"orchestration":[36,198],"across":[37],"full":[39],"song.":[40],"Existing":[41],"open-source":[42],"approaches":[43],"typically":[44],"make":[45],"compromises":[46],"among":[47],"these":[48],"goals.":[49],"Continuous-latent":[50],"generation":[51,78,104],"models":[52,70],"can":[53],"capture":[54],"long":[55],"musical":[56],"spans":[57],"but":[58,74],"often":[59],"struggle":[60],"preserve":[62],"fine-grained":[63],"detail.":[65],"In":[66],"contrast,":[67],"discrete":[68,121],"autoregressive":[69],"retain":[71],"local":[72],"fidelity":[73],"suffer":[75],"from":[76],"unidirectional":[77],"error":[80],"accumulation":[81],"in":[82],"extended":[83],"contexts.":[84],"We":[85],"present":[86],"LaDA-Band,":[87],"end-to-end":[89],"framework":[90],"that":[91,115,188],"introduces":[92,150],"Discrete":[93,106,172],"Masked":[94,107,173],"Diffusion":[95,174],"V2A":[98,103],"task.":[99],"Our":[100],"approach":[101],"formulates":[102],"as":[105],"Diffusion,":[108],"i.e.,":[109],"global,":[111],"non-autoregressive":[112],"denoising":[113],"formulation":[114],"combines":[116],"representational":[118],"advantages":[119],"of":[120],"audio":[122,213],"codec":[123],"tokens":[124],"full-sequence":[126],"bidirectional":[127],"context":[128],"modeling.":[129],"This":[130],"design":[131],"improves":[132,191],"long-range":[133],"structural":[134],"consistency":[135],"temporal":[137],"synchronization":[138],"while":[139,202],"crisp":[141],"details.":[143],"Built":[144],"on":[145,181],"this":[146],"formulation,":[147],"LaDA-Band":[148,189],"further":[149],"dual-track":[152],"prefix-conditioning":[153],"architecture,":[154],"auxiliary":[156,208],"replaced-token":[157],"detection":[158],"objective":[159],"for":[160],"weakly":[161],"anchored":[162],"regions,":[164],"two-stage":[167],"progressive":[168],"curriculum":[169],"scale":[171],"full-song":[176],"vocal-to-accompaniment":[177],"generation.":[178],"Extensive":[179],"experiments":[180],"both":[182],"academic":[183],"real-world":[185],"benchmarks":[186],"show":[187],"consistently":[190],"coherence,":[195],"over":[199],"existing":[200],"baselines,":[201],"strong":[204],"performance":[205],"even":[206],"without":[207],"reference":[209],"audio.":[210],"Codes":[211],"samples":[214],"are":[215],"available":[216],"at":[217],"https://github.com/Duoluoluos/TME-LaDA-Band":[218],".":[219]},"counts_by_year":[],"updated_date":"2026-04-15T06:04:33.058270","created_date":"2026-04-15T00:00:00"}
