{"id":"https://openalex.org/W7126037578","doi":"https://doi.org/10.48550/arxiv.2601.20564","title":"DiffVC-RT: Towards Practical Real-Time Diffusion-based Perceptual Neural Video Compression","display_name":"DiffVC-RT: Towards Practical Real-Time Diffusion-based Perceptual Neural Video Compression","publication_year":2026,"publication_date":"2026-01-28","ids":{"openalex":"https://openalex.org/W7126037578","doi":"https://doi.org/10.48550/arxiv.2601.20564"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2601.20564","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124191062","display_name":"Wenzhuo Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ma, Wenzhuo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124262832","display_name":"Zhenzhong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zhenzhong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5124191062"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.34599998593330383,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.34599998593330383,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11165","display_name":"Image and Video Quality Assessment","score":0.16380000114440918,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10741","display_name":"Video Coding and Compression Technologies","score":0.10970000177621841,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.6948999762535095},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5540000200271606},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.551800012588501},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.4787999987602234},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4772999882698059},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.40700000524520874},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.37459999322891235},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.36390000581741333},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.3569999933242798},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.35659998655319214}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8392999768257141},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.6948999762535095},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5540000200271606},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.551800012588501},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5091000199317932},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.4787999987602234},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4772999882698059},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.40700000524520874},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3977999985218048},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.37459999322891235},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.36390000581741333},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.3569999933242798},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.35659998655319214},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.34599998593330383},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.3395000100135803},{"id":"https://openalex.org/C23431618","wikidata":"https://www.wikidata.org/wiki/Q1404672","display_name":"Multiview Video Coding","level":4,"score":0.3310000002384186},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3303000032901764},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.31290000677108765},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3102000057697296},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3061999976634979},{"id":"https://openalex.org/C113954288","wikidata":"https://www.wikidata.org/wiki/Q186885","display_name":"Timestamp","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.29820001125335693},{"id":"https://openalex.org/C161765866","wikidata":"https://www.wikidata.org/wiki/Q184748","display_name":"Codec","level":2,"score":0.2879999876022339},{"id":"https://openalex.org/C2779020251","wikidata":"https://www.wikidata.org/wiki/Q3555171","display_name":"Motion vector","level":3,"score":0.2874000072479248},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.2849000096321106},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C2777851325","wikidata":"https://www.wikidata.org/wiki/Q7094102","display_name":"Online model","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C106030495","wikidata":"https://www.wikidata.org/wiki/Q1797012","display_name":"Video compression picture types","level":4,"score":0.2628999948501587},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2601.20564","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2601.20564","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.20564","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2601.20564","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/1","display_name":"No poverty","score":0.5162843465805054}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"practical":[1],"deployment":[2],"of":[3,139,152],"diffusion-based":[4,37,170],"Neural":[5],"Video":[6],"Compression":[7],"(NVC)":[8],"faces":[9],"critical":[10],"challenges,":[11],"including":[12],"severe":[13],"information":[14,64],"loss,":[15],"prohibitive":[16],"inference":[17],"latency,":[18],"and":[19,45,53,75,106,119,149],"poor":[20],"temporal":[21,81],"consistency.":[22],"To":[23],"bridge":[24],"this":[25,55],"gap,":[26],"we":[27,41,72,102],"propose":[28,73],"DiffVC-RT,":[29],"the":[30,93],"first":[31],"framework":[32],"designed":[33],"to":[34,67],"achieve":[35],"real-time":[36,147],"perceptual":[38],"NVC.":[39],"First,":[40],"introduce":[42],"an":[43,104,161],"Efficient":[44],"Informative":[46],"Model":[47],"Architecture.":[48],"Through":[49],"strategic":[50],"module":[51],"replacements":[52],"pruning,":[54],"architecture":[56],"significantly":[57],"reduces":[58],"computational":[59],"complexity":[60],"while":[61],"mitigating":[62],"structural":[63],"loss.":[65],"Second,":[66],"address":[68],"generative":[69],"flickering":[70],"artifacts,":[71],"Explicit":[74],"Implicit":[76],"Consistency":[77],"Modeling.":[78],"We":[79],"enhance":[80],"consistency":[82,99],"by":[83,96],"explicitly":[84],"incorporating":[85,110],"a":[86,124,166],"zero-cost":[87],"Online":[88],"Temporal":[89,126],"Shift":[90,127],"Module":[91],"within":[92],"U-Net,":[94],"complemented":[95],"hybrid":[97],"implicit":[98],"constraints.":[100],"Finally,":[101],"present":[103],"Asynchronous":[105],"Parallel":[107],"Decoding":[108],"Pipeline":[109],"Mixed":[111],"Half":[112],"Precision,":[113],"which":[114],"enables":[115],"asynchronous":[116],"latent":[117],"decoding":[118,150],"parallel":[120],"frame":[121],"reconstruction":[122],"via":[123],"Batch-dimension":[125],"design.":[128],"Experiments":[129],"show":[130],"that":[131],"DiffVC-RT":[132],"achieves":[133],"80.1%":[134],"bitrate":[135],"savings":[136],"in":[137,169],"terms":[138],"LPIPS":[140],"over":[141],"VTM-17.0":[142],"on":[143,160],"HEVC":[144],"dataset":[145],"with":[146],"encoding":[148],"speeds":[151],"206":[153],"/":[154],"30":[155],"fps":[156],"for":[157],"720p":[158],"videos":[159],"NVIDIA":[162],"H800":[163],"GPU,":[164],"marking":[165],"significant":[167],"milestone":[168],"video":[171],"compression.":[172]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-01-30T00:00:00"}
