{"id":"https://openalex.org/W7161700748","doi":"https://doi.org/10.48550/arxiv.2605.18749","title":"WavFlow: Audio Generation in Waveform Space","display_name":"WavFlow: Audio Generation in Waveform Space","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161700748","doi":"https://doi.org/10.48550/arxiv.2605.18749"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18749","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18749","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18749","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073170981","display_name":"Feiyan Zhou","orcid":"https://orcid.org/0009-0009-6033-0216"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Feiyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005564503","display_name":"Luyuan Wang","orcid":"https://orcid.org/0000-0001-5727-5417"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Luyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136494656","display_name":"Shoufa Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Shoufa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136486896","display_name":"Zhe Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136503671","display_name":"Zhiheng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhiheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100622051","display_name":"Yuren Cong","orcid":"https://orcid.org/0000-0001-7505-8563"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cong, Yuren","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136454715","display_name":"Xiaohui Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiaohui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113814271","display_name":"Fanny Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Fanny","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5082750097","display_name":"Belinda Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Belinda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.31279999017715454,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.31279999017715454,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.2858000099658966,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.11230000108480453,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.7116000056266785},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5839999914169312},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5519999861717224},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5332000255584717},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5281000137329102},{"id":"https://openalex.org/keywords/dynamic-range-compression","display_name":"Dynamic range compression","score":0.5149000287055969},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.4571000039577484},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.3806999921798706},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.37779998779296875}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7476999759674072},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.7116000056266785},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5839999914169312},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5839999914169312},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5519999861717224},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5332000255584717},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5281000137329102},{"id":"https://openalex.org/C150178126","wikidata":"https://www.wikidata.org/wiki/Q18433212","display_name":"Dynamic range compression","level":2,"score":0.5149000287055969},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.4571000039577484},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.42419999837875366},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.37779998779296875},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.34850001335144043},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.3327000141143799},{"id":"https://openalex.org/C24326235","wikidata":"https://www.wikidata.org/wiki/Q126095","display_name":"Electronic engineering","level":1,"score":0.32499998807907104},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.30329999327659607},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2994999885559082},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2815999984741211},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2791000008583069},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.2784000039100647},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2628999948501587},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C167940747","wikidata":"https://www.wikidata.org/wiki/Q63727227","display_name":"Audio signal flow","level":5,"score":0.2533999979496002},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2515000104904175},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.25040000677108765},{"id":"https://openalex.org/C103824480","wikidata":"https://www.wikidata.org/wiki/Q185889","display_name":"Time domain","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18749","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18749","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18749","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18749","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5049620270729065,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"audio":[1,29,51,166],"generation":[2],"predominantly":[3],"relies":[4],"on":[5,115],"latent-space":[6],"compression,":[7],"introducing":[8],"additional":[9],"complexity":[10],"and":[11,46,59,81,126,160],"potential":[12],"information":[13],"loss.":[14],"In":[15],"this":[16,20],"work,":[17],"we":[18,49,84],"challenge":[19],"paradigm":[21],"with":[22],"WavFlow,":[23],"a":[24,152,158],"framework":[25],"that":[26,110,147],"generates":[27],"high-fidelity":[28],"directly":[30],"in":[31,73],"raw":[32],"waveform":[33,57],"space":[34],"without":[35],"intermediate":[36,148],"representations.":[37],"To":[38,76],"overcome":[39],"the":[40,98,116,127,138],"inherent":[41],"difficulties":[42],"of":[43,140],"modeling":[44],"high-dimensional":[45],"low-energy":[47],"signals,":[48],"reshape":[50],"into":[52],"2D":[53],"token":[54],"grids":[55],"through":[56],"patchify":[58],"introduce":[60],"amplitude":[61],"lifting":[62],"to":[63,90,100],"align":[64],"signal":[65],"scales,":[66],"enabling":[67],"stable":[68],"optimization":[69],"via":[70],"direct":[71],"x-prediction":[72],"flow":[74],"matching.":[75],"capture":[77],"complex":[78],"semantic":[79],"alignment":[80],"temporal":[82],"synchronization,":[83],"leverage":[85],"an":[86],"automated":[87],"data":[88],"pipeline":[89],"curate":[91],"5":[92],"million":[93],"high-quality":[94,155],"video-text-audio":[95],"triplets,":[96],"allowing":[97],"model":[99],"learn":[101],"fine-grained":[102],"acoustic":[103],"patterns":[104],"from":[105],"scratch.":[106],"Experimental":[107],"results":[108],"show":[109],"WavFlow":[111],"achieves":[112],"competitive":[113],"performance":[114,139],"video-to-audio":[117],"benchmark":[118,129],"VGGSound":[119],"(FD_PaSST:":[120],"59.98,":[121],"IS_PANNs:":[122,133],"17.40,":[123],"DeSync:":[124],"0.44)":[125],"text-to-audio":[128],"AudioCaps":[130],"(FD_PANNs:":[131],"10.63,":[132],"12.62),":[134],"matching":[135],"or":[136],"exceeding":[137],"established":[141],"latent-based":[142],"methods.":[143],"Our":[144],"work":[145],"demonstrates":[146],"compression":[149],"is":[150],"not":[151],"prerequisite":[153],"for":[154,164],"synthesis,":[156],"offering":[157],"simpler":[159],"more":[161],"scalable":[162],"alternative":[163],"multimodal":[165],"generation.":[167]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
