{"id":"https://openalex.org/W7125921921","doi":"https://doi.org/10.1007/s11263-025-02632-y","title":"Taming Data and Transformers for Audio Generation","display_name":"Taming Data and Transformers for Audio Generation","publication_year":2026,"publication_date":"2026-01-28","ids":{"openalex":"https://openalex.org/W7125921921","doi":"https://doi.org/10.1007/s11263-025-02632-y"},"language":"en","primary_location":{"id":"doi:10.1007/s11263-025-02632-y","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02632-y","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1007/s11263-025-02632-y","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090496923","display_name":"Moayed Haji-Ali","orcid":"https://orcid.org/0009-0006-8224-5299"},"institutions":[{"id":"https://openalex.org/I4210142583","display_name":"Snap (United States)","ror":"https://ror.org/04dgkhg68","country_code":"US","type":"company","lineage":["https://openalex.org/I4210142583"]},{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Moayed Haji-Ali","raw_affiliation_strings":["Rice University, Houston, USA","Snap Inc, Santa Monica, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Rice University, Houston, USA","institution_ids":["https://openalex.org/I74775410"]},{"raw_affiliation_string":"Snap Inc, Santa Monica, USA","institution_ids":["https://openalex.org/I4210142583"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052645912","display_name":"Willi Menapace","orcid":"https://orcid.org/0000-0002-0715-9300"},"institutions":[{"id":"https://openalex.org/I4210142583","display_name":"Snap (United States)","ror":"https://ror.org/04dgkhg68","country_code":"US","type":"company","lineage":["https://openalex.org/I4210142583"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Willi Menapace","raw_affiliation_strings":["Snap Inc, Santa Monica, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Snap Inc, Santa Monica, USA","institution_ids":["https://openalex.org/I4210142583"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021757527","display_name":"Aliaksandr Siarohin","orcid":"https://orcid.org/0000-0001-9252-1775"},"institutions":[{"id":"https://openalex.org/I4210142583","display_name":"Snap (United States)","ror":"https://ror.org/04dgkhg68","country_code":"US","type":"company","lineage":["https://openalex.org/I4210142583"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aliaksandr Siarohin","raw_affiliation_strings":["Snap Inc, Santa Monica, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Snap Inc, Santa Monica, USA","institution_ids":["https://openalex.org/I4210142583"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Guha Balakrishnan","orcid":null},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Guha Balakrishnan","raw_affiliation_strings":["Rice University, Houston, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Rice University, Houston, USA","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"last","author":{"id":null,"display_name":"Vicente Ordonez","orcid":"https://orcid.org/0009-0006-0279-5275"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Vicente Ordonez","raw_affiliation_strings":["Rice University, Houston, USA"],"raw_orcid":"https://orcid.org/0009-0006-0279-5275","affiliations":[{"raw_affiliation_string":"Rice University, Houston, USA","institution_ids":["https://openalex.org/I74775410"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I74775410"],"apc_list":{"value":2890,"currency":"EUR","value_usd":3690},"apc_paid":{"value":2890,"currency":"EUR","value_usd":3690},"fwci":43.5768,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.99361362,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"134","issue":"3","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.4399999976158142,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.4399999976158142,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.2646999955177307,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.04569999873638153,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.8154000043869019},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.775600016117096},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6101999878883362},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6085000038146973},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.3855000138282776},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.3774000108242035},{"id":"https://openalex.org/keywords/data-format","display_name":"Data format","score":0.35920000076293945},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.3547999858856201}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.839900016784668},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.8154000043869019},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.775600016117096},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6101999878883362},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6085000038146973},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.448199987411499},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3855000138282776},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3774000108242035},{"id":"https://openalex.org/C2985331491","wikidata":"https://www.wikidata.org/wiki/Q5227298","display_name":"Data format","level":2,"score":0.35920000076293945},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35749998688697815},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.3547999858856201},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3515999913215637},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.34769999980926514},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3375999927520752},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.3127000033855438},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.3050000071525574},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2572999894618988},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s11263-025-02632-y","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02632-y","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s11263-025-02632-y","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s11263-025-02632-y","pdf_url":null,"source":{"id":"https://openalex.org/S25538012","display_name":"International Journal of Computer Vision","issn_l":"0920-5691","issn":["0920-5691","1573-1405"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Journal of Computer Vision","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5686643202","display_name":null,"funder_award_id":"2201710","funder_id":"https://openalex.org/F4320337389","funder_display_name":"Division of Information and Intelligent Systems"}],"funders":[{"id":"https://openalex.org/F4320310133","display_name":"Rice University","ror":"https://ror.org/008zs3103"},{"id":"https://openalex.org/F4320337389","display_name":"Division of Information and Intelligent Systems","ror":"https://ror.org/053a2cp42"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":70,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W1956340063","https://openalex.org/W2033875152","https://openalex.org/W2101105183","https://openalex.org/W2133512280","https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2949376505","https://openalex.org/W2972478942","https://openalex.org/W2984008963","https://openalex.org/W3015371781","https://openalex.org/W3015591594","https://openalex.org/W3034733309","https://openalex.org/W3034999214","https://openalex.org/W3094550259","https://openalex.org/W3180355996","https://openalex.org/W3195486004","https://openalex.org/W4226289673","https://openalex.org/W4226442948","https://openalex.org/W4312095907","https://openalex.org/W4312864639","https://openalex.org/W4312933868","https://openalex.org/W4313190371","https://openalex.org/W4367359628","https://openalex.org/W4372260310","https://openalex.org/W4372266552","https://openalex.org/W4372341409","https://openalex.org/W4385822467","https://openalex.org/W4385822505","https://openalex.org/W4386076002","https://openalex.org/W4386348132","https://openalex.org/W4387969125","https://openalex.org/W4389519587","https://openalex.org/W4390872297","https://openalex.org/W4392902953","https://openalex.org/W4392903177","https://openalex.org/W4392903479","https://openalex.org/W4392903535","https://openalex.org/W4392909554","https://openalex.org/W4393147046","https://openalex.org/W4393147260","https://openalex.org/W4393160294","https://openalex.org/W4396877837","https://openalex.org/W4400033239","https://openalex.org/W4401043564","https://openalex.org/W4402111756","https://openalex.org/W4402112119","https://openalex.org/W4402112510","https://openalex.org/W4402713100","https://openalex.org/W4402716264","https://openalex.org/W4402727647","https://openalex.org/W4402727849","https://openalex.org/W4402753903","https://openalex.org/W4402915976","https://openalex.org/W4403674716","https://openalex.org/W4403674737","https://openalex.org/W4403780823","https://openalex.org/W4403780831","https://openalex.org/W4403791730","https://openalex.org/W4403791956","https://openalex.org/W4403990754","https://openalex.org/W4404037650","https://openalex.org/W4404784428","https://openalex.org/W4405934446","https://openalex.org/W4408353237","https://openalex.org/W4408354411","https://openalex.org/W4410153385","https://openalex.org/W4413146212","https://openalex.org/W4415433415","https://openalex.org/W4415795395"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"The":[1],"scalability":[2,17],"of":[3,93,160],"ambient":[4,45,53],"sound":[5],"generators":[6,147],"is":[7],"hindered":[8],"by":[9,26],"data":[10,29,131,153],"scarcity,":[11],"insufficient":[12],"caption":[13,87],"quality,":[14,88],"and":[15,30,38,80,152,182,198],"limited":[16],"in":[18,49,169,180,191],"model":[19,31,139,196],"architecture.":[20],"This":[21],"work":[22],"addresses":[23],"these":[24],"challenges":[25],"advancing":[27],"both":[28],"scaling.":[32,141],"First,":[33],"we":[34,66,110,120],"propose":[35,67,111],"an":[36],"efficient":[37],"scalable":[39,114],"dataset":[40,55,199],"collection":[41],"pipeline":[42],"tailored":[43],"for":[44],"audio":[46,72,82,116,146],"generation,":[47],"resulting":[48],"AutoReCap-XL,":[50],"the":[51],"largest":[52],"audio-text":[54],"with":[56,133],"over":[57,105],"47":[58],"million":[59],"clips.":[60],"To":[61],"provide":[62],"high-quality":[63,70],"textual":[64],"annotations,":[65],"AutoCap,":[68],"a":[69,77,90,95,113],"automatic":[71],"captioning":[73,107],"model.":[74],"By":[75],"adopting":[76],"Q-Former":[78],"module":[79],"leveraging":[81],"metadata,":[83],"AutoCap":[84],"substantially":[85],"enhances":[86],"reaching":[89],"CIDEr":[91],"score":[92],"83.2,":[94],"$$3.2\\%$$":[96],"<mml:math":[97,162,173,184],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">":[98,163,174,185],"<mml:mrow>":[99,164,175,186],"<mml:mn>3.2</mml:mn>":[100],"<mml:mo>%</mml:mo>":[101,166,177,188],"</mml:mrow>":[102,167,178,189],"</mml:math>":[103,168,179,190],"improvement":[104],"previous":[106],"models.":[108],"Finally,":[109],"GenAu,":[112],"transformer-based":[115],"generation":[117],"architecture":[118],"that":[119],"scale":[121,154],"up":[122],"to":[123,144],"1.25B":[124],"parameters.":[125],"We":[126],"demonstrate":[127],"its":[128],"benefits":[129],"from":[130],"scaling":[132],"synthetic":[134],"captions":[135],"as":[136,138],"well":[137],"size":[140,151],"When":[142],"compared":[143],"baseline":[145],"trained":[148],"at":[149],"similar":[150],",":[155],"GenAu":[156],"obtains":[157],"significant":[158],"improvements":[159],"$$4.7\\%$$":[161],"<mml:mn>4.7</mml:mn>":[165],"FAD":[170],"score,":[171],"$$22.65\\%$$":[172],"<mml:mn>22.65</mml:mn>":[176],"IS,":[181],"$$13.5\\%$$":[183],"<mml:mn>13.5</mml:mn>":[187],"CLAP":[192],"score.":[193],"Our":[194],"code,":[195],"checkpoints,":[197],"are":[200],"publicly":[201],"available":[202],".":[203]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-29T00:00:00"}
