{"id":"https://openalex.org/W7108698057","doi":"https://doi.org/10.5281/zenodo.17811454","title":"STAGE: Stemmed Accompaniment Generation through Prefix-Based Conditioning","display_name":"STAGE: Stemmed Accompaniment Generation through Prefix-Based Conditioning","publication_year":2025,"publication_date":"2025-09-21","ids":{"openalex":"https://openalex.org/W7108698057","doi":"https://doi.org/10.5281/zenodo.17811454"},"language":null,"primary_location":{"id":"doi:10.5281/zenodo.17811454","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811454","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.17811454","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Giorgio Strano","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Giorgio Strano","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chiara Ballanti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chiara Ballanti","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Donato Crisostomi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Donato Crisostomi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Michele Mancusi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Michele Mancusi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Luca Cosmo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luca Cosmo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Emanuele Rodol\u00e0","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Emanuele Rodol\u00e0","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.53760033,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7890999913215637,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7890999913215637,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.09650000184774399,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.019200000911951065,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6287999749183655},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6258999705314636},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5921000242233276},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.5586000084877014},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.5382000207901001},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.47290000319480896},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.46470001339912415},{"id":"https://openalex.org/keywords/context-model","display_name":"Context model","score":0.38260000944137573}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6340000033378601},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6287999749183655},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6258999705314636},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5921000242233276},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.5586000084877014},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.5382000207901001},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.47290000319480896},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.46470001339912415},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4449000060558319},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.38260000944137573},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3808000087738037},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32989999651908875},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.3237000107765198},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.32199999690055847},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3127000033855438},{"id":"https://openalex.org/C45262634","wikidata":"https://www.wikidata.org/wiki/Q5159291","display_name":"Conditioning","level":2,"score":0.302700012922287},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.29269999265670776},{"id":"https://openalex.org/C109568592","wikidata":"https://www.wikidata.org/wiki/Q207628","display_name":"Musical composition","level":3,"score":0.288100004196167},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.2793000042438507},{"id":"https://openalex.org/C135343436","wikidata":"https://www.wikidata.org/wiki/Q170406","display_name":"Rhythm","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.2612000107765198},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C2993931450","wikidata":"https://www.wikidata.org/wiki/Q639197","display_name":"Instrumental music","level":3,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.17811454","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811454","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.17811454","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.17811454","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.4410993754863739,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,174],"generative":[3],"models":[4,23,41],"have":[5],"made":[6],"it":[7],"possible":[8],"to":[9,38,64,94,96,104],"create":[10],"high-quality,":[11],"coherent":[12],"music,":[13],"with":[14,87,114,124,143],"some":[15],"systems":[16],"delivering":[17],"production-level":[18],"output.":[19],"Yet,":[20],"most":[21],"existing":[22],"focus":[24],"solely":[25],"on":[26,70,130],"generating":[27],"music":[28,165],"from":[29,60],"scratch,":[30],"limiting":[31],"their":[32],"usefulness":[33],"for":[34,78,163],"musicians":[35,173],"who":[36],"want":[37],"integrate":[39],"such":[40],"into":[42],"a":[43,71,88,97,131,155,159],"human,":[44],"iterative":[45],"composition":[46],"workflow.":[47],"In":[48],"this":[49],"paper":[50],"we":[51,81],"introduce":[52],"STAGE,":[53],"our":[54,134],"STemmed":[55],"Accompaniment":[56],"GEneration":[57],"model,":[58],"fine-tuned":[59],"the":[61,83,92,105,115,144],"state-of-the-art":[62,141],"MusicGen":[63],"generate":[65],"single-stem":[66],"instrumental":[67],"accompaniments":[68,109],"conditioned":[69],"given":[72],"mixture.":[73],"Inspired":[74],"by":[75,128,172],"instruction-tuning":[76],"methods":[77],"language":[79],"models,":[80],"extend":[82],"transformer's":[84],"embedding":[85],"matrix":[86],"context":[89,99],"token,":[90],"enabling":[91],"model":[93],"attend":[95],"musical":[98],"through":[100],"prefix-based":[101],"conditioning.":[102],"Compared":[103],"baselines,":[106],"STAGE":[107,157],"yields":[108],"that":[110,167],"exhibit":[111],"stronger":[112],"coherence":[113],"input":[116],"mixture,":[117],"higher":[118],"audio":[119],"quality,":[120],"and":[121],"closer":[122],"alignment":[123,142],"textual":[125],"prompts.":[126],"Moreover,":[127],"conditioning":[129],"metronome-like":[132],"track,":[133],"framework":[135],"naturally":[136],"supports":[137],"tempo-constrained":[138],"generation,":[139],"achieving":[140],"target":[145],"rhythmic":[146],"structure--all":[147],"without":[148],"requiring":[149],"any":[150],"additional":[151],"tempo-specific":[152],"module.":[153],"As":[154],"result,":[156],"offers":[158],"practical,":[160],"versatile":[161],"tool":[162],"interactive":[164],"creation":[166],"can":[168],"be":[169],"readily":[170],"adopted":[171],"real-world":[175],"workflows.":[176]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-05T00:00:00"}
