{"id":"https://openalex.org/W7130570766","doi":"https://doi.org/10.48550/arxiv.2602.16687","title":"Scaling Open Discrete Audio Foundation Models with Interleaved Semantic, Acoustic, and Text Tokens","display_name":"Scaling Open Discrete Audio Foundation Models with Interleaved Semantic, Acoustic, and Text Tokens","publication_year":2026,"publication_date":"2026-02-18","ids":{"openalex":"https://openalex.org/W7130570766","doi":"https://doi.org/10.48550/arxiv.2602.16687"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.16687","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024447363","display_name":"Potsawee Manakul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Manakul, Potsawee","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120047473","display_name":"Woody Haosheng Gan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gan, Woody Haosheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033914946","display_name":"Martijn Bartelds","orcid":"https://orcid.org/0000-0003-1006-8669"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bartelds, Martijn","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126433083","display_name":"Guangzhi Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Guangzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109972562","display_name":"William A. Held","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Held, William","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126438806","display_name":"Diyi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Diyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3172999918460846,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.3172999918460846,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.11599999666213989,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.10189999639987946,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7562000155448914},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.5292999744415283},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.48980000615119934},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4867999851703644},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4526999890804291},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.4318999946117401}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7996000051498413},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7562000155448914},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.5292999744415283},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.48980000615119934},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4867999851703644},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4526999890804291},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.4318999946117401},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3652999997138977},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3564999997615814},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3522999882698059},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.35019999742507935},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.319599986076355},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3084000051021576},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27720001339912415},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25839999318122864}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.16687","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.16687","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.16687","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.16687","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.48687195777893066}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"audio":[1,17,21,32,40,55,100],"language":[2],"models":[3,34,101,107,139],"are":[4],"predominantly":[5],"text-first,":[6],"either":[7],"extending":[8],"pre-trained":[9],"text":[10,50,78],"LLM":[11],"backbones":[12],"or":[13],"relying":[14],"on":[15,105,145],"semantic-only":[16],"tokens,":[18,147],"limiting":[19],"general":[20,54],"modeling.":[22],"This":[23],"paper":[24],"presents":[25],"a":[26,86,136,159],"systematic":[27],"empirical":[28,63],"study":[29,97],"of":[30,138],"native":[31],"foundation":[33],"that":[35,114],"apply":[36,126],"next-token":[37],"prediction":[38],"to":[39,51,110,129,142],"at":[41],"scale,":[42],"jointly":[43],"modeling":[44],"semantic":[45],"content,":[46],"acoustic":[47],"details,":[48],"and":[49,57,81,153],"support":[52],"both":[53],"generation":[56],"cross-modal":[58],"capabilities.":[59],"We":[60,70,91,125],"provide":[61],"comprehensive":[62],"insights":[64],"for":[65,98,162,172],"building":[66],"such":[67],"models:":[68],"(1)":[69],"systematically":[71],"investigate":[72],"design":[73],"choices":[74],"--":[75,84,166],"data":[76,116],"sources,":[77],"mixture":[79],"ratios,":[80],"token":[82],"composition":[83],"establishing":[85],"validated":[87],"training":[88],"recipe.":[89],"(2)":[90],"conduct":[92],"the":[93,177],"first":[94],"scaling":[95,151],"law":[96],"discrete":[99],"via":[102],"IsoFLOP":[103],"analysis":[104],"64":[106],"spanning":[108],"$3{\\times}10^{18}$":[109],"$3{\\times}10^{20}$":[111],"FLOPs,":[112],"finding":[113],"optimal":[115,121],"grows":[117],"1.6$\\times$":[118],"faster":[119],"than":[120],"model":[122],"size.":[123],"(3)":[124],"these":[127],"lessons":[128],"train":[130],"SODA":[131,156],"(Scaling":[132],"Open":[133],"Discrete":[134],"Audio),":[135],"suite":[137],"from":[140],"135M":[141],"4B":[143],"parameters":[144],"500B":[146],"comparing":[148],"against":[149],"our":[150],"predictions":[152],"existing":[154],"models.":[155],"serves":[157],"as":[158],"flexible":[160],"backbone":[161],"diverse":[163],"audio/text":[164],"tasks":[165],"we":[167],"demonstrate":[168],"this":[169],"by":[170],"fine-tuning":[171],"voice-preserving":[173],"speech-to-speech":[174],"translation,":[175],"using":[176],"same":[178],"unified":[179],"architecture.":[180]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-20T00:00:00"}
