{"id":"https://openalex.org/W4392909680","doi":"https://doi.org/10.1109/icassp48485.2024.10447392","title":"Stack-and-Delay: A New Codebook Pattern for Music Generation","display_name":"Stack-and-Delay: A New Codebook Pattern for Music Generation","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392909680","doi":"https://doi.org/10.1109/icassp48485.2024.10447392"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10447392","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10447392","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012026156","display_name":"Ga\u00ebl Le Lan","orcid":"https://orcid.org/0000-0002-1493-5777"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gael Le Lan","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034208747","display_name":"Varun Nagaraja","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Varun Nagaraja","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021305048","display_name":"Ernie Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ernie Chang","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019683792","display_name":"David Kant","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"David Kant","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031088292","display_name":"Zhaoheng Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaoheng Ni","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103247973","display_name":"Yangyang Shi","orcid":"https://orcid.org/0000-0001-5297-4155"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yangyang Shi","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031485831","display_name":"Forrest Iandola","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Forrest Iandola","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016704219","display_name":"Vikas Chandra","orcid":"https://orcid.org/0009-0005-4996-8455"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vikas Chandra","raw_affiliation_strings":["Meta AI"],"affiliations":[{"raw_affiliation_string":"Meta AI","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5012026156"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3768,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.4972426,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"34","issue":null,"first_page":"796","last_page":"800"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9664000272750854,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codebook","display_name":"Codebook","score":0.8909874558448792},{"id":"https://openalex.org/keywords/stack","display_name":"Stack (abstract data type)","score":0.6550511717796326},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6387037038803101},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.17360171675682068},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.07820022106170654}],"concepts":[{"id":"https://openalex.org/C127759330","wikidata":"https://www.wikidata.org/wiki/Q637416","display_name":"Codebook","level":2,"score":0.8909874558448792},{"id":"https://openalex.org/C9395851","wikidata":"https://www.wikidata.org/wiki/Q177929","display_name":"Stack (abstract data type)","level":2,"score":0.6550511717796326},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6387037038803101},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.17360171675682068},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.07820022106170654}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10447392","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icassp48485.2024.10447392","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2526050071","https://openalex.org/W3162926177","https://openalex.org/W3215615641","https://openalex.org/W4288089799","https://openalex.org/W4307323391","https://openalex.org/W4313021454","https://openalex.org/W4313679638","https://openalex.org/W4318351475","https://openalex.org/W4319989813","https://openalex.org/W4322718191","https://openalex.org/W4377010126","https://openalex.org/W4378499140","https://openalex.org/W4380136719","https://openalex.org/W4383993968","https://openalex.org/W4401110409","https://openalex.org/W6769627184","https://openalex.org/W6795288823","https://openalex.org/W6845479124","https://openalex.org/W6848735303","https://openalex.org/W6849105126","https://openalex.org/W6849635556","https://openalex.org/W6850625674","https://openalex.org/W6852824296","https://openalex.org/W6853096648","https://openalex.org/W6853188576","https://openalex.org/W6854494257"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2293149949","https://openalex.org/W2026099691","https://openalex.org/W4284672201","https://openalex.org/W2377486419","https://openalex.org/W2943202426","https://openalex.org/W2736714427","https://openalex.org/W2163679795","https://openalex.org/W2137816434"],"abstract_inverted_index":{"Language":[0],"modeling":[1],"based":[2],"music":[3],"generation":[4],"relies":[5],"on":[6],"discrete":[7,24],"representations":[8],"of":[9,23,40,85,108,112,164],"audio":[10,13],"frames.":[11],"An":[12],"frame":[14],"(e.g.":[15,26],"20ms)":[16],"is":[17,45],"typically":[18,35],"represented":[19],"by":[20,29,141,153],"a":[21,30,37,81,96],"set":[22],"codes":[25,41,67],"4)":[27],"computed":[28],"neural":[31],"codec.":[32],"Autoregressive":[33],"decoding":[34,60,86],"generates":[36],"few":[38],"thousands":[39],"per":[42],"song,":[43],"which":[44,147],"prohibitively":[46],"slow":[47],"and":[48,144,161,166],"implies":[49],"introducing":[50],"some":[51],"parallel":[52,72],"decoding.":[53],"In":[54],"this":[55],"paper":[56],"we":[57],"compare":[58],"different":[59],"strategies":[61],"that":[62,107,149],"aim":[63],"to":[64,87,106],"understand":[65],"what":[66],"can":[68],"be":[69],"decoded":[70],"in":[71,127],"without":[73],"penalizing":[74],"the":[75,90,109,113,118,123,132,150,154],"quality":[76],"too":[77],"much.":[78],"We":[79],"propose":[80],"novel":[82],"stack-and-delay":[83],"style":[84],"improve":[88],"upon":[89],"vanilla":[91,135],"(flattened":[92],"codes)":[93],"decoding,":[94],"with":[95,134],"4":[97],"fold":[98],"inference":[99,103,120],"speedup.":[100],"This":[101],"brings":[102],"speed":[104],"close":[105],"previous":[110],"state":[111],"art":[114],"(delay":[115],"strategy).":[116],"For":[117],"same":[119],"efficiency":[121],"budget":[122],"proposed":[124],"approach":[125],"outperforms":[126],"objective":[128],"evaluations,":[129],"almost":[130],"closing":[131],"gap":[133],"quality-wise.":[136],"The":[137],"results":[138],"are":[139],"supported":[140],"spectral":[142],"analysis":[143],"listening":[145],"tests,":[146],"demonstrate":[148],"samples":[151],"produced":[152],"new":[155],"model":[156],"exhibit":[157],"improved":[158],"high-frequency":[159],"rendering":[160],"better":[162],"maintenance":[163],"harmonics":[165],"rhythm":[167],"patterns.":[168]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
