{"id":"https://openalex.org/W4404037650","doi":"https://doi.org/10.1109/mlsp58920.2024.10734721","title":"Foleygen: Visually-Guided Audio Generation","display_name":"Foleygen: Visually-Guided Audio Generation","publication_year":2024,"publication_date":"2024-09-22","ids":{"openalex":"https://openalex.org/W4404037650","doi":"https://doi.org/10.1109/mlsp58920.2024.10734721"},"language":"en","primary_location":{"id":"doi:10.1109/mlsp58920.2024.10734721","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp58920.2024.10734721","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 34th International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070892237","display_name":"Xinhao Mei","orcid":"https://orcid.org/0000-0001-6079-5130"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xinhao Mei","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034208747","display_name":"Varun Nagaraja","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Varun Nagaraja","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012026156","display_name":"Ga\u00ebl Le Lan","orcid":"https://orcid.org/0000-0002-1493-5777"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gael Le Lan","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031088292","display_name":"Zhaoheng Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaoheng Ni","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021305048","display_name":"Ernie Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ernie Chang","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115596237","display_name":"Yangyang Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yangyang Shi","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016704219","display_name":"Vikas Chandra","orcid":"https://orcid.org/0009-0005-4996-8455"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vikas Chandra","raw_affiliation_strings":["Meta"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meta","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5070892237"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.5122,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.95582926,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8460999727249146,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8460999727249146,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7709000110626221,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7522708773612976},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.33995088934898376},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.3304826021194458}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7522708773612976},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.33995088934898376},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.3304826021194458}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/mlsp58920.2024.10734721","is_oa":false,"landing_page_url":"https://doi.org/10.1109/mlsp58920.2024.10734721","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 34th International Workshop on Machine Learning for Signal Processing (MLSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W2526050071","https://openalex.org/W2560662850","https://openalex.org/W2912947616","https://openalex.org/W2963807156","https://openalex.org/W2964345931","https://openalex.org/W3015371781","https://openalex.org/W3046890131","https://openalex.org/W3205475937","https://openalex.org/W4221167396","https://openalex.org/W4281758439","https://openalex.org/W4288099666","https://openalex.org/W4307323391","https://openalex.org/W4372348103","https://openalex.org/W4380136719","https://openalex.org/W4381786045","https://openalex.org/W4383045354","https://openalex.org/W4385245566","https://openalex.org/W4386071707","https://openalex.org/W4393160294","https://openalex.org/W4396877837","https://openalex.org/W6757220786","https://openalex.org/W6791353385","https://openalex.org/W6802805937","https://openalex.org/W6810265253","https://openalex.org/W6838322825","https://openalex.org/W6840815571","https://openalex.org/W6845479124","https://openalex.org/W6849109464","https://openalex.org/W6853096648","https://openalex.org/W6854445938"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,170,246],"audio":[3,79,92,140,156,178,241],"generation":[4,31,66,90,172],"tasks,":[5],"such":[6],"as":[7],"text-to-audio":[8],"and":[9,22,47,50,86,237],"text-to-music":[10],"generation,":[11],"have":[12],"been":[13],"spurred":[14],"by":[15,96,118,124],"the":[16,26,40,44,51,120,125,130,155,161,165,174],"evolution":[17],"of":[18,28,39,91,164,176,206],"deep":[19],"learning":[20],"models":[21,229],"large-scale":[23],"datasets.":[24],"However,":[25],"task":[27],"video-to-audio":[29],"(V2A)":[30],"continues":[32],"to":[33,142,144,154,192],"be":[34,244],"a":[35,70,97,109,135],"challenge,":[36],"principally":[37],"because":[38],"intricate":[41],"relationship":[42],"between":[43,84],"high-dimensional":[45],"visual":[46,105,110,121,126,145,151,181,189,208],"auditory":[48],"data,":[49],"challenges":[52],"associated":[53],"with":[54,129,179],"temporal":[55],"synchronization.":[56,198],"In":[57,147],"this":[58],"study,":[59],"we":[60,186,200],"introduce":[61],"FoleyGen,":[62],"an":[63,76,203],"open-domain":[64],"V2A":[65,171],"system":[67],"built":[68],"on":[69,104,196,212,221],"language":[71],"modeling":[72],"paradigm.":[73],"FoleyGen":[74,112,228],"leverages":[75],"off-the-shelf":[77],"neural":[78],"codec":[80],"for":[81],"bidirectional":[82],"conversion":[83],"waveforms":[85],"discrete":[87],"tokens.":[88],"The":[89,218,240],"tokens":[93,141],"is":[94,102,173],"facilitated":[95],"single":[98],"Transformer":[99,131],"model,":[100],"which":[101],"conditioned":[103],"features":[106,113,122,152],"extracted":[107,123],"from":[108],"encoder.":[111],"two":[114],"distinct":[115],"versions,":[116],"differentiated":[117],"how":[119],"encoder":[127],"interact":[128],"model.":[132],"FoleyGen-C":[133],"employs":[134],"cross-attention":[136],"module":[137],"that":[138,225],"enables":[139],"attend":[143],"features.":[146],"contrast,":[148],"FoleyGen-P":[149],"appends":[150],"directly":[153],"tokens,":[157],"allowing":[158],"interactions":[159],"within":[160],"self-attention":[162],"mechanism":[163],"Transformer.":[166],"A":[167],"significant":[168],"challenge":[169],"misalignment":[175],"generated":[177],"corresponding":[180],"actions.":[182],"To":[183],"address":[184],"this,":[185],"develop":[187],"three":[188],"attention":[190],"mechanisms":[191],"assess":[193],"their":[194],"impact":[195],"audio-visual":[197],"Additionally,":[199],"further":[201],"undertake":[202],"exhaustive":[204],"evaluation":[205],"multiple":[207],"encoders,":[209],"each":[210],"pretrained":[211],"either":[213],"single-modal":[214],"or":[215],"multi-modal":[216],"tasks.":[217],"experimental":[219],"results":[220],"VGGSound":[222],"dataset":[223],"show":[224],"our":[226,247],"proposed":[227],"outperforms":[230],"previous":[231],"systems":[232],"across":[233],"all":[234],"objective":[235],"metrics":[236],"human":[238],"evaluations.":[239],"samples":[242],"can":[243],"found":[245],"demo":[248],"page:":[249],"https://xinhaomei.github.io/foleygen_demo/.":[250]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":7}],"updated_date":"2026-05-07T13:39:58.223016","created_date":"2025-10-10T00:00:00"}
