{"id":"https://openalex.org/W7138470400","doi":"https://doi.org/10.48550/arxiv.2603.15597","title":"AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer","display_name":"AC-Foley: Reference-Audio-Guided Video-to-Audio Synthesis with Acoustic Transfer","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138470400","doi":"https://doi.org/10.48550/arxiv.2603.15597"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15597","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15597","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15597","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129649379","display_name":"Pengjun Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fang, Pengjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129697220","display_name":"Yingqing He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Yingqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129698148","display_name":"Yazhou Xing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Yazhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129731521","display_name":"Qifeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qifeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129711616","display_name":"Ser-Nam Lim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lim, Ser-Nam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129695899","display_name":"Harry Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Harry","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5129649379"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5206000208854675,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5206000208854675,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.14880000054836273,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.09019999951124191,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/timbre","display_name":"Timbre","score":0.6430000066757202},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.4505000114440918},{"id":"https://openalex.org/keywords/sound","display_name":"Sound (geography)","score":0.43799999356269836},{"id":"https://openalex.org/keywords/sound-recording-and-reproduction","display_name":"Sound recording and reproduction","score":0.41749998927116394},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.4108999967575073},{"id":"https://openalex.org/keywords/audio-feedback","display_name":"Audio feedback","score":0.37400001287460327},{"id":"https://openalex.org/keywords/surround-sound","display_name":"Surround sound","score":0.3716000020503998},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.36800000071525574},{"id":"https://openalex.org/keywords/microphone","display_name":"Microphone","score":0.34369999170303345}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7437999844551086},{"id":"https://openalex.org/C2776539107","wikidata":"https://www.wikidata.org/wiki/Q176501","display_name":"Timbre","level":3,"score":0.6430000066757202},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5303000211715698},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.4505000114440918},{"id":"https://openalex.org/C203718221","wikidata":"https://www.wikidata.org/wiki/Q491713","display_name":"Sound (geography)","level":2,"score":0.43799999356269836},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.41749998927116394},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.4108999967575073},{"id":"https://openalex.org/C38956757","wikidata":"https://www.wikidata.org/wiki/Q716215","display_name":"Audio feedback","level":2,"score":0.37400001287460327},{"id":"https://openalex.org/C2780544925","wikidata":"https://www.wikidata.org/wiki/Q569874","display_name":"Surround sound","level":3,"score":0.3716000020503998},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.36800000071525574},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.34369999170303345},{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.3386000096797943},{"id":"https://openalex.org/C499572226","wikidata":"https://www.wikidata.org/wiki/Q1937950","display_name":"Sound design","level":3,"score":0.3330000042915344},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.32839998602867126},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3260999917984009},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31299999356269836},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.29760000109672546},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.289900004863739},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.2802000045776367},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2727999985218048},{"id":"https://openalex.org/C88485024","wikidata":"https://www.wikidata.org/wiki/Q1054571","display_name":"Cepstrum","level":2,"score":0.2721000015735626},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C81299745","wikidata":"https://www.wikidata.org/wiki/Q334269","display_name":"Transfer function","level":2,"score":0.2533999979496002},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15597","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15597","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15597","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15597","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.40986233949661255}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"video-to-audio":[1,136],"(V2A)":[2],"generation":[3,125],"methods":[4,137],"predominantly":[5],"rely":[6],"on":[7,99,128],"text":[8,109],"prompts":[9],"alongside":[10],"visual":[11],"information":[12],"to":[13,48,72],"synthesize":[14],"audio.":[15],"However,":[16],"two":[17],"critical":[18],"bottlenecks":[19,44],"persist:":[20],"semantic":[21,106],"granularity":[22],"gaps":[23],"in":[24,39],"training":[25],"data,":[26],"such":[27],"as":[28],"conflating":[29],"acoustically":[30],"distinct":[31],"sounds":[32],"under":[33],"coarse":[34],"labels,":[35],"and":[36,75,92,143],"textual":[37],"ambiguity":[38],"describing":[40],"micro-acoustic":[41],"features.":[42],"These":[43],"make":[45],"it":[46],"difficult":[47],"perform":[49],"fine-grained":[50,76,84],"sound":[51,85,90],"synthesis":[52],"using":[53],"text-controlled":[54],"modes.":[55],"To":[56],"address":[57],"these":[58],"limitations,":[59],"we":[60],"propose":[61],"AC-Foley,":[62],"an":[63],"audio-conditioned":[64],"V2A":[65],"model":[66],"that":[67],"directly":[68,97],"leverages":[69],"reference":[70,129],"audio":[71,94,100,140],"achieve":[73],"precise":[74,113],"control":[77],"over":[78],"generated":[79],"sounds.":[80],"This":[81],"approach":[82,103],"enables":[83],"synthesis,":[86],"timbre":[87],"transfer,":[88],"zero-shot":[89],"generation,":[91],"improved":[93],"quality.":[95],"By":[96],"conditioning":[98],"signals,":[101],"our":[102],"bypasses":[104],"the":[105],"ambiguities":[107],"of":[108,115],"descriptions":[110],"while":[111,131],"enabling":[112],"manipulation":[114],"acoustic":[116],"attributes.":[117],"Empirically,":[118],"AC-Foley":[119],"achieves":[120],"state-of-the-art":[121,135],"performance":[122],"for":[123],"Foley":[124],"when":[126],"conditioned":[127],"audio,":[130],"remaining":[132],"competitive":[133],"with":[134],"even":[138],"without":[139],"conditioning.":[141],"Code":[142],"demo":[144],"are":[145],"available":[146],"at:":[147],"https://ff2416.github.io/AC-Foley-Page":[148]},"counts_by_year":[],"updated_date":"2026-03-24T05:59:24.953642","created_date":"2026-03-18T00:00:00"}
