{"id":"https://openalex.org/W4391547560","doi":"https://doi.org/10.1109/tmm.2024.3362149","title":"TA2V: Text-Audio Guided Video Generation","display_name":"TA2V: Text-Audio Guided Video Generation","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4391547560","doi":"https://doi.org/10.1109/tmm.2024.3362149"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2024.3362149","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3362149","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101414990","display_name":"Minglu Zhao","orcid":"https://orcid.org/0009-0005-3632-4167"},"institutions":[{"id":"https://openalex.org/I111950717","display_name":"Macau University of Science and Technology","ror":"https://ror.org/03jqs2n27","country_code":"MO","type":"education","lineage":["https://openalex.org/I111950717","https://openalex.org/I4391767947"]}],"countries":["MO"],"is_corresponding":true,"raw_author_name":"Minglu Zhao","raw_affiliation_strings":["School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China"],"raw_orcid":"https://orcid.org/0009-0005-3632-4167","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China","institution_ids":["https://openalex.org/I111950717"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017052768","display_name":"Wenmin Wang","orcid":"https://orcid.org/0000-0003-2664-4413"},"institutions":[{"id":"https://openalex.org/I111950717","display_name":"Macau University of Science and Technology","ror":"https://ror.org/03jqs2n27","country_code":"MO","type":"education","lineage":["https://openalex.org/I111950717","https://openalex.org/I4391767947"]}],"countries":["MO"],"is_corresponding":false,"raw_author_name":"Wenmin Wang","raw_affiliation_strings":["School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China"],"raw_orcid":"https://orcid.org/0000-0003-2664-4413","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China","institution_ids":["https://openalex.org/I111950717"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007853804","display_name":"Tongbao Chen","orcid":"https://orcid.org/0000-0002-7719-8364"},"institutions":[{"id":"https://openalex.org/I111950717","display_name":"Macau University of Science and Technology","ror":"https://ror.org/03jqs2n27","country_code":"MO","type":"education","lineage":["https://openalex.org/I111950717","https://openalex.org/I4391767947"]}],"countries":["MO"],"is_corresponding":false,"raw_author_name":"Tongbao Chen","raw_affiliation_strings":["School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China"],"raw_orcid":"https://orcid.org/0000-0002-7719-8364","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China","institution_ids":["https://openalex.org/I111950717"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100422207","display_name":"Rui Zhang","orcid":"https://orcid.org/0009-0004-6589-2510"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Zhang","raw_affiliation_strings":["School of Mechanical Engineering, Beijing Institute of Technology, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0004-6589-2510","affiliations":[{"raw_affiliation_string":"School of Mechanical Engineering, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101653828","display_name":"Ruochen Li","orcid":"https://orcid.org/0000-0002-4341-6474"},"institutions":[{"id":"https://openalex.org/I111950717","display_name":"Macau University of Science and Technology","ror":"https://ror.org/03jqs2n27","country_code":"MO","type":"education","lineage":["https://openalex.org/I111950717","https://openalex.org/I4391767947"]}],"countries":["MO"],"is_corresponding":false,"raw_author_name":"Ruochen Li","raw_affiliation_strings":["School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China"],"raw_orcid":"https://orcid.org/0000-0002-4341-6474","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Macau University of Science and Technology, Macau, China","institution_ids":["https://openalex.org/I111950717"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101414990"],"corresponding_institution_ids":["https://openalex.org/I111950717"],"apc_list":null,"apc_paid":null,"fwci":3.8092,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.9432251,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"26","issue":null,"first_page":"7250","last_page":"7264"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12720","display_name":"Multimedia Communication and Technology","score":0.9731000065803528,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.877450704574585},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.4945926368236542},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.4101562798023224},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.40320536494255066},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.2101987898349762},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.20904657244682312}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.877450704574585},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.4945926368236542},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4101562798023224},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.40320536494255066},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.2101987898349762},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.20904657244682312}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2024.3362149","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2024.3362149","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":92,"referenced_works":["https://openalex.org/W24089286","https://openalex.org/W967544008","https://openalex.org/W1522301498","https://openalex.org/W1583912456","https://openalex.org/W1901129140","https://openalex.org/W2079735306","https://openalex.org/W2117539524","https://openalex.org/W2191779130","https://openalex.org/W2593116425","https://openalex.org/W2619947201","https://openalex.org/W2752796333","https://openalex.org/W2759171953","https://openalex.org/W2896457183","https://openalex.org/W2902437806","https://openalex.org/W2908510526","https://openalex.org/W2963524571","https://openalex.org/W3015371781","https://openalex.org/W3083173864","https://openalex.org/W3096831136","https://openalex.org/W3101943858","https://openalex.org/W3110013267","https://openalex.org/W3115255359","https://openalex.org/W3125775899","https://openalex.org/W3129651364","https://openalex.org/W3153469116","https://openalex.org/W3157199281","https://openalex.org/W3160431667","https://openalex.org/W3161635175","https://openalex.org/W3165405144","https://openalex.org/W3174807077","https://openalex.org/W3176445421","https://openalex.org/W3180355996","https://openalex.org/W3215495615","https://openalex.org/W3216352822","https://openalex.org/W4221155660","https://openalex.org/W4224035735","https://openalex.org/W4281632497","https://openalex.org/W4283705858","https://openalex.org/W4294541506","https://openalex.org/W4297798428","https://openalex.org/W4298185919","https://openalex.org/W4303440777","https://openalex.org/W4310695675","https://openalex.org/W4312393956","https://openalex.org/W4312633146","https://openalex.org/W4312655926","https://openalex.org/W4312872987","https://openalex.org/W4312913021","https://openalex.org/W4362514612","https://openalex.org/W4366196985","https://openalex.org/W4366457372","https://openalex.org/W4375957555","https://openalex.org/W4377372281","https://openalex.org/W4384078665","https://openalex.org/W4384161812","https://openalex.org/W4384519430","https://openalex.org/W4385565405","https://openalex.org/W4386076291","https://openalex.org/W4386162736","https://openalex.org/W4387968055","https://openalex.org/W4390872117","https://openalex.org/W4390873135","https://openalex.org/W4393147998","https://openalex.org/W4402727429","https://openalex.org/W6600983433","https://openalex.org/W6625168331","https://openalex.org/W6631190155","https://openalex.org/W6635084905","https://openalex.org/W6679045638","https://openalex.org/W6714644935","https://openalex.org/W6755207826","https://openalex.org/W6756789066","https://openalex.org/W6757817989","https://openalex.org/W6765585376","https://openalex.org/W6765779288","https://openalex.org/W6779823529","https://openalex.org/W6783182287","https://openalex.org/W6791353385","https://openalex.org/W6794385437","https://openalex.org/W6809885388","https://openalex.org/W6810125463","https://openalex.org/W6810793953","https://openalex.org/W6810940779","https://openalex.org/W6838785959","https://openalex.org/W6844305113","https://openalex.org/W6845281891","https://openalex.org/W6850636897","https://openalex.org/W6851962417","https://openalex.org/W6851978907","https://openalex.org/W6852335179","https://openalex.org/W6852975727","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2478288626","https://openalex.org/W2350741829","https://openalex.org/W2530322880","https://openalex.org/W1596801655"],"abstract_inverted_index":{"Recent":[0],"conditional":[1],"and":[2,18,58,86,115,170,192,208,228,234],"unconditional":[3],"video":[4,42,72,93,113,118,130,160,180],"generation":[5,73,145],"tasks":[6],"have":[7],"been":[8,63],"accomplished":[9],"mainly":[10],"based":[11],"on":[12],"generative":[13],"adversarial":[14],"network":[15],"(GAN),":[16],"diffusion,":[17],"autoregressive":[19,140],"models.":[20],"However,":[21],"in":[22,35,146],"some":[23],"circumstances,":[24],"using":[25],"only":[26],"one":[27],"modality":[28],"cannot":[29],"provide":[30],"enough":[31],"semantic":[32,164,229],"information.":[33,230],"Therefore,":[34],"this":[36],"paper,":[37],"we":[38,100,151],"propose":[39,101],"text-audio":[40],"to":[41,69,127,132,142,157,167],"(TA2V)":[43],"generation,":[44,71],"a":[45,75,111,116,124,133,153,175,193],"new":[46],"task":[47,77],"for":[48,91],"generating":[49],"realistic":[50],"videos":[51,224],"from":[52],"two":[53,109],"different":[54],"guided":[55],"modalities,":[56],"text":[57],"audio,":[59,207],"which":[60,106,202],"has":[61],"not":[62],"explored":[64],"much":[65],"thus":[66],"far.":[67],"Compared":[68],"image":[70],"is":[74],"harder":[76],"because":[78],"of":[79,82,108,184,201],"the":[80,102,147,158,168,182],"complexity":[81],"processing":[83],"higher-dimensional":[84],"data":[85,131],"scarcer":[87],"suitable":[88],"datasets,":[89],"especially":[90],"multimodal":[92],"generation.":[94],"To":[95],"overcome":[96],"these":[97],"limitations,":[98],"(i)":[99],"Text&Audio-guided-Video-Maker":[103],"(TAgVM)":[104],"model,":[105],"consists":[107],"modules:":[110],"text-guided":[112],"generator":[114],"text&audio-guided":[117,154],"modifier.":[119],"(ii)":[120],"This":[121],"model":[122,141,156,221],"uses":[123],"3D":[125],"VQ-GAN":[126],"compress":[128],"high-dimension":[129],"low-dimension":[134],"discrete":[135],"sequence,":[136],"followed":[137],"by":[138],"an":[139],"guide":[143],"text-conditional":[144],"latent":[148],"space.":[149],"Then,":[150],"apply":[152],"diffusion":[155],"generated":[159],"scenes,":[161],"providing":[162],"additional":[163],"details":[165],"corresponding":[166],"audio":[169],"text.":[171],"(iii)":[172],"We":[173],"introduce":[174],"newly":[176],"produced":[177],"music":[178],"performance":[179],"dataset,":[181,195],"University":[183],"Rochester":[185],"Multimodal":[186],"Music":[187],"Performance":[188],"with":[189,197,213,225],"Video-Audio-Text":[190,198],"(URMP-VAT),":[191],"landscape":[194],"Landscape":[196],"(Landscape-VAT),":[199],"both":[200],"include":[203],"three":[204],"modalities":[205],"(text,":[206],"video)":[209],"that":[210,219],"are":[211,236],"aligned":[212],"each":[214],"other.":[215],"The":[216,231],"results":[217],"demonstrate":[218],"our":[220],"can":[222],"create":[223],"satisfactory":[226],"quality":[227],"source":[232],"code":[233],"datasets":[235],"available":[237],"at":[238],"<uri":[239],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[240],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://github.com/Minglu58/TA2V.</uri>":[241]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
