{"id":"https://openalex.org/W4416250979","doi":"https://doi.org/10.1109/waspaa66052.2025.11230970","title":"SpecMaskFoley: Steering Pretrained Spectral Masked Generative Transformer Toward Synchronized Video-to-audio Synthesis via ControlNet","display_name":"SpecMaskFoley: Steering Pretrained Spectral Masked Generative Transformer Toward Synchronized Video-to-audio Synthesis via ControlNet","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416250979","doi":"https://doi.org/10.1109/waspaa66052.2025.11230970"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230970","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230970","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040961663","display_name":"Zhi Zhong","orcid":"https://orcid.org/0000-0002-8730-5530"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhi Zhong","raw_affiliation_strings":["Sony Group Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050144186","display_name":"Akira Takahashi","orcid":"https://orcid.org/0000-0003-4289-9838"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Akira Takahashi","raw_affiliation_strings":["Sony Group Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050673381","display_name":"Shuyang Cui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuyang Cui","raw_affiliation_strings":["Sony Group Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050224316","display_name":"Keisuke Toyama","orcid":"https://orcid.org/0000-0003-3632-2530"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Keisuke Toyama","raw_affiliation_strings":["Sony Group Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104117184","display_name":"Shusuke Takahashi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shusuke Takahashi","raw_affiliation_strings":["Sony Group Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Japan","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088754502","display_name":"Yuki Mitsufuji","orcid":"https://orcid.org/0000-0002-6806-6140"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":["Sony Group Corporation,Japan"],"affiliations":[{"raw_affiliation_string":"Sony Group Corporation,Japan","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5040961663"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37378254,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7342000007629395,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7342000007629395,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.07509999722242355,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0608999989926815,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/foley","display_name":"Foley","score":0.791100025177002},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6804999709129333},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6043999791145325},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5939000248908997},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5177000164985657},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.47119998931884766}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8126999735832214},{"id":"https://openalex.org/C110446960","wikidata":"https://www.wikidata.org/wiki/Q762316","display_name":"Foley","level":2,"score":0.791100025177002},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6804999709129333},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6043999791145325},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5956000089645386},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5939000248908997},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5177000164985657},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.47119998931884766},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.44530001282691956},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4050999879837036},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.39419999718666077},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3361000120639801},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.27869999408721924}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230970","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230970","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W3015371781","https://openalex.org/W3094550259","https://openalex.org/W3205475937","https://openalex.org/W4313021454","https://openalex.org/W4313156423","https://openalex.org/W4372260310","https://openalex.org/W4386071584","https://openalex.org/W4386071707","https://openalex.org/W4386075767","https://openalex.org/W4386763840","https://openalex.org/W4390873054","https://openalex.org/W4392904237","https://openalex.org/W4392904679","https://openalex.org/W4393160294","https://openalex.org/W4402727849","https://openalex.org/W4408345573","https://openalex.org/W4408345630","https://openalex.org/W4408345930","https://openalex.org/W4408352828","https://openalex.org/W4408356092","https://openalex.org/W4409364205","https://openalex.org/W4411245083","https://openalex.org/W4413145987","https://openalex.org/W4415796144"],"related_works":[],"abstract_inverted_index":{"Foley":[0],"synthesis":[1,54,133,185,203],"aims":[2],"to":[3,66,73,82],"synthesize":[4],"high-quality":[5],"audio":[6,41,48],"that":[7,124,188],"is":[8],"both":[9],"semantically":[10],"and":[11,111,154],"temporally":[12],"aligned":[13],"with":[14],"video":[15,101,152],"frames.":[16],"Given":[17],"its":[18,77],"broad":[19],"application":[20],"in":[21,30,177],"creative":[22],"industries,":[23],"the":[24,31,36,126,138,147,150,155,159,169,198],"task":[25,38],"has":[26,70,79],"gained":[27],"increasing":[28],"attention":[29],"research":[32],"community.":[33],"To":[34,115,136],"avoid":[35],"non-trivial":[37],"of":[39,140,158,200],"training":[40],"generative":[42,49,68],"models":[43,50,90],"from":[44],"scratch,":[45],"adapting":[46],"pretrained":[47,67,100,127,160],"for":[51,62,171],"video-synchronized":[52,131],"foley":[53,74,113,132,184,202],"presents":[55],"an":[56],"attractive":[57],"direction.":[58],"ControlNet,":[59],"a":[60,106,122,141,163,182],"method":[61,123],"adding":[63],"fine-grained":[64],"controls":[65],"models,":[69],"been":[71,80],"applied":[72],"synthesis,":[75],"but":[76],"use":[78],"limited":[81],"handcrafted":[83],"human-readable":[84],"temporal":[85,151,165],"conditions.":[86],"In":[87],"contrast,":[88],"from-scratch":[89,112,194],"achieved":[91],"success":[92],"by":[93],"leveraging":[94],"high-dimensional":[95],"deep":[96],"features":[97,153],"extracted":[98],"using":[99],"encoders.":[102],"We":[103],"have":[104],"observed":[105],"performance":[107],"gap":[108],"between":[109,149],"ControlNet-based":[110,201],"models.":[114,204],"narrow":[116],"this":[117],"gap,":[118],"we":[119,145],"propose":[120],"SpecMaskFoley,":[121],"steers":[125],"SpecMaskGIT":[128,161],"model":[129],"toward":[130],"via":[134,162],"ControlNet.":[135],"unlock":[137],"potential":[139],"single":[142],"ControlNet":[143],"branch,":[144],"resolve":[146],"discrepancy":[148],"time-frequency":[156],"nature":[157],"frequency-aware":[164],"feature":[166],"aligner,":[167],"eliminating":[168],"need":[170],"complicated":[172],"conditioning":[173],"mechanisms":[174],"widely":[175],"used":[176],"prior":[178],"arts.":[179],"Evaluations":[180],"on":[181],"common":[183],"benchmark":[186],"demonstrate":[187],"SpecMaskFoley":[189],"could":[190],"even":[191],"outperform":[192],"strong":[193],"baselines,":[195],"substantially":[196],"advancing":[197],"development":[199],"Demo":[205],"page:":[206],"https://zzaudio.github.io/SpecMaskFoley_Demo/.":[207]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
