{"id":"https://openalex.org/W4416251270","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228639","title":"A Simple but Strong Baseline for Sounding Video Generation: Effective Adaptation of Audio and Video Diffusion Models for Joint Generation","display_name":"A Simple but Strong Baseline for Sounding Video Generation: Effective Adaptation of Audio and Video Diffusion Models for Joint Generation","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4416251270","doi":"https://doi.org/10.1109/ijcnn64981.2025.11228639"},"language":null,"primary_location":{"id":"doi:10.1109/ijcnn64981.2025.11228639","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228639","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113447319","display_name":"Masato Ishii","orcid":null},"institutions":[{"id":"https://openalex.org/I4210122684","display_name":"Sony Computer Science Laboratories","ror":"https://ror.org/02nc46417","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210122684"]},{"id":"https://openalex.org/I4210143797","display_name":"Sony (Japan)","ror":"https://ror.org/04wzv3n59","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210143797"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Masato Ishii","raw_affiliation_strings":["Sony AI,Tokyo,Japan"],"affiliations":[{"raw_affiliation_string":"Sony AI,Tokyo,Japan","institution_ids":["https://openalex.org/I4210122684","https://openalex.org/I4210143797"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091621857","display_name":"Akio Hayakawa","orcid":null},"institutions":[{"id":"https://openalex.org/I4210122684","display_name":"Sony Computer Science Laboratories","ror":"https://ror.org/02nc46417","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210122684"]},{"id":"https://openalex.org/I4210143797","display_name":"Sony (Japan)","ror":"https://ror.org/04wzv3n59","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210143797"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Akio Hayakawa","raw_affiliation_strings":["Sony AI,Tokyo,Japan"],"affiliations":[{"raw_affiliation_string":"Sony AI,Tokyo,Japan","institution_ids":["https://openalex.org/I4210122684","https://openalex.org/I4210143797"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084845461","display_name":"Takashi Shibuya","orcid":"https://orcid.org/0000-0002-4277-0164"},"institutions":[{"id":"https://openalex.org/I4210122684","display_name":"Sony Computer Science Laboratories","ror":"https://ror.org/02nc46417","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210122684"]},{"id":"https://openalex.org/I4210143797","display_name":"Sony (Japan)","ror":"https://ror.org/04wzv3n59","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210143797"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takashi Shibuya","raw_affiliation_strings":["Sony AI,Tokyo,Japan"],"affiliations":[{"raw_affiliation_string":"Sony AI,Tokyo,Japan","institution_ids":["https://openalex.org/I4210122684","https://openalex.org/I4210143797"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088754502","display_name":"Yuki Mitsufuji","orcid":"https://orcid.org/0000-0002-6806-6140"},"institutions":[{"id":"https://openalex.org/I4210121096","display_name":"Dexerials (Japan)","ror":"https://ror.org/03p330m15","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210121096"]},{"id":"https://openalex.org/I4210143797","display_name":"Sony (Japan)","ror":"https://ror.org/04wzv3n59","country_code":"JP","type":"company","lineage":["https://openalex.org/I4210143797"]},{"id":"https://openalex.org/I4210122684","display_name":"Sony Computer Science Laboratories","ror":"https://ror.org/02nc46417","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210122684"]},{"id":"https://openalex.org/I1304132090","display_name":"Sony (Taiwan)","ror":"https://ror.org/0214y7014","country_code":"TW","type":"company","lineage":["https://openalex.org/I1304132090","https://openalex.org/I4210143797"]}],"countries":["JP","TW"],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":["Sony AI / Sony Group Corp.,Tokyo,Japan"],"affiliations":[{"raw_affiliation_string":"Sony AI / Sony Group Corp.,Tokyo,Japan","institution_ids":["https://openalex.org/I4210122684","https://openalex.org/I4210121096","https://openalex.org/I1304132090","https://openalex.org/I4210143797"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5113447319"],"corresponding_institution_ids":["https://openalex.org/I4210122684","https://openalex.org/I4210143797"],"apc_list":null,"apc_paid":null,"fwci":2.856,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.9250427,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.690500020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.690500020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.08470000326633453,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.08309999853372574,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.6194999814033508},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.6136999726295471},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5989000201225281},{"id":"https://openalex.org/keywords/depth-sounding","display_name":"Depth sounding","score":0.5630000233650208},{"id":"https://openalex.org/keywords/position","display_name":"Position (finance)","score":0.5001999735832214},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.48750001192092896},{"id":"https://openalex.org/keywords/base","display_name":"Base (topology)","score":0.4507000148296356}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8251000046730042},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.6194999814033508},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.6136999726295471},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5989000201225281},{"id":"https://openalex.org/C55510283","wikidata":"https://www.wikidata.org/wiki/Q1382947","display_name":"Depth sounding","level":2,"score":0.5630000233650208},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.5001999735832214},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.48750001192092896},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4691999852657318},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.4507000148296356},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.41449999809265137},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.383899986743927},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3824999928474426},{"id":"https://openalex.org/C192328126","wikidata":"https://www.wikidata.org/wiki/Q4514647","display_name":"Schematic","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.32120001316070557},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.29159998893737793},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2567000091075897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn64981.2025.11228639","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11228639","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2963663420","https://openalex.org/W2964345931","https://openalex.org/W2972478942","https://openalex.org/W3007605881","https://openalex.org/W3015371781","https://openalex.org/W3046890131","https://openalex.org/W3180355996","https://openalex.org/W3187009280","https://openalex.org/W4293363567","https://openalex.org/W4312633146","https://openalex.org/W4312655926","https://openalex.org/W4312933868","https://openalex.org/W4362514612","https://openalex.org/W4385245566","https://openalex.org/W4386071656","https://openalex.org/W4386071707","https://openalex.org/W4386071828","https://openalex.org/W4386071957","https://openalex.org/W4386075767","https://openalex.org/W4387195417","https://openalex.org/W4392910528","https://openalex.org/W4393147998","https://openalex.org/W4393160294","https://openalex.org/W4402727849","https://openalex.org/W4403791730","https://openalex.org/W4404439850","https://openalex.org/W4411245083","https://openalex.org/W4415795232","https://openalex.org/W4415798552"],"related_works":[],"abstract_inverted_index":{"In":[0,105],"this":[1],"work,":[2],"we":[3,22,50],"build":[4],"a":[5,29,91,137],"simple":[6],"but":[7],"strong":[8],"baseline":[9],"for":[10,18,141],"sounding":[11],"video":[12],"generation.":[13],"Given":[14],"base":[15,71],"diffusion":[16],"models":[17],"audio":[19,41],"and":[20,32,42,118,159],"video,":[21],"integrate":[23],"them":[24],"with":[25,83,130],"additional":[26,96],"modules":[27],"into":[28,123],"single":[30],"model":[31,38,125],"train":[33],"it":[34,113],"to":[35,69,76],"make":[36],"the":[37,95,119,124,131,145,151,154],"jointly":[39],"generate":[40],"video.":[43],"To":[44],"enhance":[45],"alignment":[46,143],"between":[47],"audio-video":[48],"pairs,":[49],"introduce":[51],"two":[52,155],"novel":[53],"mechanisms":[54,158],"in":[55,144],"our":[56,163],"model.":[57,72],"The":[58,87],"first":[59],"one":[60,89],"is":[61,74,90,109],"timestep":[62,67],"adjustment,":[63],"which":[64],"provides":[65,136],"different":[66],"information":[68,108],"each":[70],"It":[73],"designed":[75],"align":[77],"how":[78],"samples":[79],"are":[80,121],"generated":[81,146],"along":[82],"timesteps":[84],"across":[85],"modalities.":[86],"second":[88],"new":[92],"design":[93],"of":[94,153],"modules,":[97],"termed":[98],"Cross-Modal":[99],"Conditioning":[100],"as":[101,111],"Positional":[102],"Encoding":[103],"(CMC-PE).":[104],"CMC-PE,":[106],"cross-modal":[107],"embedded":[110],"if":[112],"represents":[114],"temporal":[115,142],"position":[116],"information,":[117],"embeddings":[120],"fed":[122],"like":[126],"positional":[127],"encoding.":[128],"Compared":[129],"popular":[132],"cross-attention":[133],"mechanism,":[134],"CMC-PE":[135],"better":[138],"inductive":[139],"bias":[140],"data.":[147],"Experimental":[148],"results":[149],"validate":[150],"effectiveness":[152],"newly":[156],"introduced":[157],"also":[160],"demonstrate":[161],"that":[162],"method":[164],"outperforms":[165],"existing":[166],"methods.":[167]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
