{"id":"https://openalex.org/W4403780456","doi":"https://doi.org/10.1109/tcsvt.2025.3635224","title":"Multi-Modal Generative AI: Multi-Modal LLMs, Diffusions, and the Unification","display_name":"Multi-Modal Generative AI: Multi-Modal LLMs, Diffusions, and the Unification","publication_year":2025,"publication_date":"2025-11-20","ids":{"openalex":"https://openalex.org/W4403780456","doi":"https://doi.org/10.1109/tcsvt.2025.3635224"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2025.3635224","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3635224","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2409.14993","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Xin Wang","orcid":"https://orcid.org/0000-0002-0351-2939"},"institutions":[{"id":"https://openalex.org/I78675632","display_name":"Beijing Information Science & Technology University","ror":"https://ror.org/04xnqep60","country_code":"CN","type":"education","lineage":["https://openalex.org/I78675632"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xin Wang","raw_affiliation_strings":["Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-0351-2939","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089","https://openalex.org/I78675632"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101273536","display_name":"Yuwei Zhou","orcid":"https://orcid.org/0000-0001-9582-7331"},"institutions":[{"id":"https://openalex.org/I78675632","display_name":"Beijing Information Science & Technology University","ror":"https://ror.org/04xnqep60","country_code":"CN","type":"education","lineage":["https://openalex.org/I78675632"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuwei Zhou","raw_affiliation_strings":["Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-9582-7331","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089","https://openalex.org/I78675632"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Bin Huang","orcid":"https://orcid.org/0009-0000-2504-3689"},"institutions":[{"id":"https://openalex.org/I78675632","display_name":"Beijing Information Science & Technology University","ror":"https://ror.org/04xnqep60","country_code":"CN","type":"education","lineage":["https://openalex.org/I78675632"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Huang","raw_affiliation_strings":["Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-2504-3689","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089","https://openalex.org/I78675632"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hong Chen","orcid":"https://orcid.org/0000-0002-0943-2286"},"institutions":[{"id":"https://openalex.org/I78675632","display_name":"Beijing Information Science & Technology University","ror":"https://ror.org/04xnqep60","country_code":"CN","type":"education","lineage":["https://openalex.org/I78675632"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Chen","raw_affiliation_strings":["Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-0943-2286","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089","https://openalex.org/I78675632"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100339293","display_name":"Wenwu Zhu","orcid":"https://orcid.org/0000-0003-2236-9290"},"institutions":[{"id":"https://openalex.org/I78675632","display_name":"Beijing Information Science & Technology University","ror":"https://ror.org/04xnqep60","country_code":"CN","type":"education","lineage":["https://openalex.org/I78675632"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenwu Zhu","raw_affiliation_strings":["Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-2236-9290","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Beijing Information Science and Technology National Research Center, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089","https://openalex.org/I78675632"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I78675632","https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":6.4082,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.95376281,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":"36","issue":"4","first_page":"5621","last_page":"5641"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9933000206947327,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9017000198364258,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.8827128410339355},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.7416752576828003},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4727272093296051},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.41258540749549866},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3537547290325165},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.1581653356552124},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.14490795135498047}],"concepts":[{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.8827128410339355},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.7416752576828003},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4727272093296051},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.41258540749549866},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3537547290325165},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.1581653356552124},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.14490795135498047},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tcsvt.2025.3635224","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3635224","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2409.14993","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.14993","pdf_url":"https://arxiv.org/pdf/2409.14993","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2409.14993","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2409.14993","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2409.14993","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.14993","pdf_url":"https://arxiv.org/pdf/2409.14993","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3022012397","display_name":null,"funder_award_id":"BNR2023TD03006","funder_id":"https://openalex.org/F4320329777","funder_display_name":"Beijing National Research Center For Information Science And Technology"},{"id":"https://openalex.org/G8567821897","display_name":null,"funder_award_id":"62222209","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320328373","display_name":"National Key Laboratory Foundation of China","ror":null},{"id":"https://openalex.org/F4320329777","display_name":"Beijing National Research Center For Information Science And Technology","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403780456.pdf","grobid_xml":"https://content.openalex.org/works/W4403780456.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2380075625","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890"],"abstract_inverted_index":{"Multi-modal":[0,23],"generative":[1,60,177,200],"AI":[2,178],"(Artificial":[3],"Intelligence)":[4],"has":[5],"attracted":[6],"increasing":[7],"attention":[8],"from":[9],"both":[10,88],"academia":[11],"and":[12,36,66,71,91,103,125,133,141,148,164],"industry.":[13],"Particularly,":[14],"two":[15],"dominant":[16],"families":[17],"of":[18,46,58,87,131,198],"techniques":[19],"have":[20],"emerged:":[21],"i)":[22],"large":[24],"language":[25],"models":[26,39,122],"(LLMs)":[27],"demonstrate":[28],"impressive":[29],"ability":[30],"for":[31,69,78,123,157,175],"<italic":[32,47],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[33,48],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">multi-modal":[34,49],"understanding</i>;":[35],"ii)":[37],"Diffusion":[38],"exhibit":[40],"remarkable":[41],"multi-modal":[42,59,63,89,100,176,199],"powers":[43],"in":[44],"terms":[45],"generation</i>.":[50],"Therefore,":[51],"this":[52],"paper":[53],"provides":[54],"a":[55,75,84],"comprehensive":[56],"overview":[57],"AI,":[61],"including":[62,95,139],"LLMs,":[64],"diffusions,":[65],"the":[67,117,129,170,195],"unification":[68,130],"understanding":[70,124,132],"generation.":[72,113,126],"To":[73,127],"lay":[74],"solid":[76],"foundation":[77],"unified":[79,121,158],"models,":[80,93,159],"we":[81,115,135,168,184],"first":[82],"provide":[83],"detailed":[85],"review":[86],"LLMs":[90,108],"diffusion":[92],"respectively,":[94],"their":[96,161],"probabilistic":[97],"modeling":[98],"procedure,":[99],"architecture":[101],"design,":[102],"advanced":[104],"applications":[105],"to":[106,194],"image/video":[107],"as":[109,111,144,146],"well":[110,145],"text-to-image/video":[112],"Furthermore,":[114],"explore":[116],"emerging":[118],"efforts":[119],"toward":[120],"achieve":[128],"generation,":[134],"investigate":[136],"key":[137],"designs":[138],"autoregressive-based":[140],"diffusion-based":[142],"modeling,":[143],"dense":[147],"Mixture-of-Experts":[149],"(MoE)":[150],"architectures.":[151],"We":[152],"then":[153],"introduce":[154],"several":[155,186],"strategies":[156],"analyzing":[160],"potential":[162],"advantages":[163],"disadvantages.":[165],"In":[166],"addition,":[167],"summarize":[169],"common":[171],"datasets":[172],"widely":[173],"used":[174],"pretraining.":[179],"Last":[180],"but":[181],"not":[182],"least,":[183],"present":[185],"challenging":[187],"future":[188],"research":[189],"directions":[190],"that":[191],"may":[192],"contribute":[193],"ongoing":[196],"advancement":[197],"AI.":[201]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2026-04-30T09:15:22.047038","created_date":"2024-10-26T00:00:00"}
