{"id":"https://openalex.org/W4408352493","doi":"https://doi.org/10.1109/icassp49660.2025.10887573","title":"TA-V2A: Textually Assisted Video-to-Audio Generation","display_name":"TA-V2A: Textually Assisted Video-to-Audio Generation","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408352493","doi":"https://doi.org/10.1109/icassp49660.2025.10887573"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10887573","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10887573","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069665712","display_name":"You You","orcid":"https://orcid.org/0000-0002-2205-0530"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuhuan You","raw_affiliation_strings":["Peking University,State Key Laboratory of General Artificial Intelligence School of Intelligence Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,State Key Laboratory of General Artificial Intelligence School of Intelligence Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084685506","display_name":"Xihong Wu","orcid":"https://orcid.org/0009-0004-5236-7469"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xihong Wu","raw_affiliation_strings":["Peking University,State Key Laboratory of General Artificial Intelligence School of Intelligence Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,State Key Laboratory of General Artificial Intelligence School of Intelligence Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103080532","display_name":"Tianshu Qu","orcid":"https://orcid.org/0000-0001-6256-2031"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianshu Qu","raw_affiliation_strings":["Peking University,State Key Laboratory of General Artificial Intelligence School of Intelligence Science and Technology,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,State Key Laboratory of General Artificial Intelligence School of Intelligence Science and Technology,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5069665712"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":3.2654,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.89949223,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.98580002784729,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9417999982833862,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9269000291824341,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6128974556922913},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.35499802231788635},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.34082406759262085}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6128974556922913},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.35499802231788635},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.34082406759262085}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10887573","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10887573","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W2593116425","https://openalex.org/W2990503944","https://openalex.org/W3015371781","https://openalex.org/W3094550259","https://openalex.org/W3160582690","https://openalex.org/W3187009280","https://openalex.org/W4285606530","https://openalex.org/W4312824283","https://openalex.org/W4312933868","https://openalex.org/W4372260310","https://openalex.org/W4386071828","https://openalex.org/W4386075767","https://openalex.org/W4392904694","https://openalex.org/W4393160294","https://openalex.org/W4402111658","https://openalex.org/W4403363072","https://openalex.org/W4403944333","https://openalex.org/W4404545746","https://openalex.org/W4409364205","https://openalex.org/W4410915437","https://openalex.org/W4411245083","https://openalex.org/W6791353385","https://openalex.org/W6795288823","https://openalex.org/W6840815571","https://openalex.org/W6845479124","https://openalex.org/W6847076894","https://openalex.org/W6849109464","https://openalex.org/W6852975727","https://openalex.org/W6854445938","https://openalex.org/W6856794988","https://openalex.org/W6862315687","https://openalex.org/W6869582388","https://openalex.org/W6870122244","https://openalex.org/W6870515680","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"As":[0],"artificial":[1],"intelligence-generated":[2],"content":[3,27],"(AIGC)":[4],"continues":[5],"to":[6,77,98,110,132],"evolve,":[7],"video-to-audio":[8,137],"(V2A)":[9],"generation":[10],"has":[11],"emerged":[12],"as":[13,49],"a":[14,38,68],"key":[15],"area":[16],"with":[17],"promising":[18],"applications":[19],"in":[20,42,81],"multimedia":[21],"editing,":[22],"augmented":[23],"reality,":[24],"and":[25,31,74,114,135],"automated":[26,107],"creation.":[28],"While":[29],"Transformer":[30],"Diffusion":[32],"models":[33,51,88],"have":[34],"advanced":[35],"audio":[36],"generation,":[37],"significant":[39],"challenge":[40],"persists":[41],"extracting":[43],"precise":[44],"semantic":[45,79,100,125],"information":[46],"from":[47],"videos,":[48],"current":[50],"often":[52],"lose":[53],"sequential":[54],"context":[55],"by":[56],"relying":[57],"solely":[58],"on":[59],"frame-based":[60],"features.":[61],"To":[62],"address":[63],"this,":[64],"we":[65],"present":[66],"TA-V2A,":[67],"method":[69],"that":[70],"integrates":[71],"language,":[72],"audio,":[73],"video":[75,91],"features":[76],"improve":[78],"representation":[80],"latent":[82],"space.":[83],"By":[84],"incorporating":[85],"large":[86],"language":[87],"for":[89],"enhanced":[90],"comprehension,":[92],"our":[93],"approach":[94],"leverages":[95],"text":[96,108],"guidance":[97],"enrich":[99],"expression.":[101],"Our":[102],"diffusion":[103],"model-based":[104],"system":[105],"utilizes":[106],"modulation":[109],"enhance":[111],"inference":[112],"quality":[113],"efficiency,":[115],"providing":[116],"personalized":[117],"control":[118],"through":[119],"text-guided":[120],"interfaces.":[121],"This":[122],"integration":[123],"enhances":[124],"expression":[126],"while":[127],"ensuring":[128],"temporal":[129],"alignment,":[130],"leading":[131],"more":[133],"accurate":[134],"coherent":[136],"generation.":[138]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
