{"id":"https://openalex.org/W4406023797","doi":"https://doi.org/10.48550/arxiv.2408.08093","title":"When Video Coding Meets Multimodal Large Language Models: A Unified Paradigm for Video Coding","display_name":"When Video Coding Meets Multimodal Large Language Models: A Unified Paradigm for Video Coding","publication_year":2024,"publication_date":"2024-08-15","ids":{"openalex":"https://openalex.org/W4406023797","doi":"https://doi.org/10.48550/arxiv.2408.08093"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.08093","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08093","pdf_url":"https://arxiv.org/pdf/2408.08093","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.08093","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100451860","display_name":"Pingping Zhang","orcid":"https://orcid.org/0000-0003-4188-1572"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Pingping","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100454990","display_name":"Jinlong Li","orcid":"https://orcid.org/0000-0003-2279-9506"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jinlong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Kecheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kecheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377170","display_name":"Meng Wang","orcid":"https://orcid.org/0000-0002-5655-1464"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Meng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xu, Long","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Long","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Haoliang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haoliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027171279","display_name":"Nicu Sebe","orcid":"https://orcid.org/0000-0002-6597-7248"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sebe, Nicu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008386708","display_name":"Sam Kwong","orcid":"https://orcid.org/0000-0001-7484-7261"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kwong, Sam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100385178","display_name":"Shiqi Wang","orcid":"https://orcid.org/0000-0002-3583-959X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shiqi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100451860"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.5199000239372253,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.5199000239372253,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.6981897950172424},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6940191984176636},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.33460283279418945},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3277130722999573},{"id":"https://openalex.org/keywords/sociology","display_name":"Sociology","score":0.07888221740722656}],"concepts":[{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.6981897950172424},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6940191984176636},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.33460283279418945},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3277130722999573},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.07888221740722656},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.08093","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08093","pdf_url":"https://arxiv.org/pdf/2408.08093","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.08093","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.08093","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.08093","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.08093","pdf_url":"https://arxiv.org/pdf/2408.08093","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4406023797.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Existing":[0],"codecs":[1],"are":[2,76,99],"designed":[3],"to":[4,8,49,82,101,119,128,150,160],"eliminate":[5],"intrinsic":[6],"redundancies":[7],"create":[9,102],"a":[10,36,46,67],"compact":[11,85],"representation":[12,52,86],"for":[13,39,111,142,184],"compression.":[14,32],"However,":[15],"strong":[16],"external":[17],"priors":[18],"from":[19],"Multimodal":[20],"Large":[21],"Language":[22],"Models":[23],"(MLLMs)":[24],"have":[25],"not":[26],"been":[27],"explicitly":[28],"explored":[29],"in":[30,57,187],"video":[31,54,58,68,96,108,188],"Herein,":[33],"we":[34,65,135],"introduce":[35],"unified":[37],"paradigm":[38],"Cross-Modality":[40],"Video":[41],"Coding":[42],"(CMVC),":[43],"which":[44,75,154],"is":[45],"pioneering":[47],"approach":[48],"explore":[50],"multimodality":[51],"and":[53,72,95,124],"generative":[55],"models":[56,98],"coding.":[59,189],"Specifically,":[60],"on":[61,164],"the":[62,156],"encoder":[63],"side,":[64],"disentangle":[66],"into":[69,79],"spatial":[70],"content":[71],"motion":[73,158],"components,":[74],"subsequently":[77],"transformed":[78],"distinct":[80],"modalities":[81],"achieve":[83,129],"very":[84],"by":[87],"leveraging":[88],"MLLMs.":[89],"During":[90],"decoding,":[91],"previously":[92],"encoded":[93],"components":[94],"generation":[97],"leveraged":[100],"multiple":[103],"encoding-decoding":[104],"modes":[105],"that":[106,167],"optimize":[107],"reconstruction":[109],"quality":[110],"specific":[112],"decoding":[113],"requirements,":[114],"including":[115],"Text-Text-to-Video":[116],"(TT2V)":[117],"mode":[118,127,144],"ensure":[120],"high-quality":[121],"semantic":[122,171],"information":[123],"Image-Text-to-Video":[125],"(IT2V)":[126],"superb":[130],"perceptual":[131,152,177],"consistency.":[132,178],"In":[133],"addition,":[134],"propose":[136],"an":[137],"efficient":[138],"frame":[139],"interpolation":[140],"model":[141],"IT2V":[143,174],"via":[145],"Low-Rank":[146],"Adaption":[147],"(LoRA)":[148],"tuning":[149],"guarantee":[151],"quality,":[153],"allows":[155],"generated":[157],"cues":[159],"behave":[161],"smoothly.":[162],"Experiments":[163],"benchmarks":[165],"indicate":[166],"TT2V":[168],"achieves":[169],"effective":[170],"reconstruction,":[172],"while":[173],"exhibits":[175],"competitive":[176],"These":[179],"results":[180],"highlight":[181],"potential":[182],"directions":[183],"future":[185],"research":[186]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2025-10-10T00:00:00"}
