{"id":"https://openalex.org/W7140117962","doi":"https://doi.org/10.48550/arxiv.2603.19370","title":"VAMPO: Policy Optimization for Improving Visual Dynamics in Video Action Models","display_name":"VAMPO: Policy Optimization for Improving Visual Dynamics in Video Action Models","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7140117962","doi":"https://doi.org/10.48550/arxiv.2603.19370"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.19370","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19370","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.19370","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044674916","display_name":"Zirui Ge","orcid":"https://orcid.org/0000-0001-6307-1330"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ge, Zirui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130407971","display_name":"Pengxiang Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Pengxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101341072","display_name":"Baohua Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Baohua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130382640","display_name":"Qishen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qishen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130329271","display_name":"Zhiyong Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Zhiyong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130399573","display_name":"Yemin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yemin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121100896","display_name":"Jinbo Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jinbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111111965","display_name":"Hengtao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hengtao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083582113","display_name":"Runze Suo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Suo, Runze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130360657","display_name":"Wenxuan Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Wenxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130343185","display_name":"Han Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lyu, Shangke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Shangke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130349511","display_name":"Zhaoxin Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Zhaoxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130351526","display_name":"Haoang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haoang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130391823","display_name":"Ran Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Ran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130346354","display_name":"Cheng Chi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chi, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034953387","display_name":"Huibin Ge","orcid":"https://orcid.org/0000-0003-1230-520X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Huibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055246977","display_name":"Y. H. Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Yaozhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130406917","display_name":"Donglin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Donglin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":19,"corresponding_author_ids":["https://openalex.org/A5044674916"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.2718999981880188,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.2718999981880188,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.257999986410141,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.19509999454021454,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5358999967575073},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5076000094413757},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.46000000834465027},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4235999882221222},{"id":"https://openalex.org/keywords/downstream","display_name":"Downstream (manufacturing)","score":0.399399995803833},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.39890000224113464},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.38589999079704285},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.36010000109672546}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7688000202178955},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5924999713897705},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5358999967575073},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5076000094413757},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4731999933719635},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.46000000834465027},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4235999882221222},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.399399995803833},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.39890000224113464},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.38589999079704285},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.36010000109672546},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.34860000014305115},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.3407000005245209},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3321000039577484},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.32829999923706055},{"id":"https://openalex.org/C106189395","wikidata":"https://www.wikidata.org/wiki/Q176789","display_name":"Markov decision process","level":3,"score":0.32420000433921814},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.3028999865055084},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2973000109195709},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.2782000005245209},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.26910001039505005},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2669999897480011},{"id":"https://openalex.org/C77405623","wikidata":"https://www.wikidata.org/wiki/Q598451","display_name":"System dynamics","level":2,"score":0.2606000006198883},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.19370","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19370","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.19370","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19370","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.7337145209312439}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"action":[1,89,184],"models":[2,90],"are":[3,33],"an":[4,129],"appealing":[5],"foundation":[6],"for":[7,51],"Vision--Language--Action":[8],"systems":[9],"because":[10],"they":[11],"can":[12,70],"learn":[13],"visual":[14,48,85,117,178],"dynamics":[15,49,86,118],"from":[16],"large-scale":[17],"video":[18,31,88],"data":[19],"and":[20,66,107,163,171,186],"transfer":[21],"this":[22,124,159],"knowledge":[23],"to":[24,58,98,181],"downstream":[25,74,183],"robot":[26],"control.":[27],"Yet":[28],"current":[29],"diffusion-based":[30],"predictors":[32],"trained":[34],"with":[35,112,161],"likelihood-surrogate":[36],"objectives,":[37],"which":[38],"encourage":[39],"globally":[40],"plausible":[41],"predictions":[42],"without":[43],"explicitly":[44],"optimizing":[45],"the":[46,109,138,149,152],"precision-critical":[47],"needed":[50],"manipulation.":[52],"This":[53],"objective":[54],"mismatch":[55],"often":[56],"leads":[57],"subtle":[59],"errors":[60],"in":[61,87,119],"object":[62],"pose,":[63],"spatial":[64],"relations,":[65],"contact":[67],"timing":[68],"that":[69,82,133],"be":[71],"amplified":[72],"by":[73],"policies.":[75],"We":[76,156],"propose":[77],"VAMPO,":[78],"a":[79,103,164],"post-training":[80],"framework":[81],"directly":[83],"improves":[84,176],"through":[91],"policy":[92,111],"optimization.":[93],"Our":[94],"key":[95],"idea":[96],"is":[97,191],"formulate":[99],"multi-step":[100],"denoising":[101,110,140,154],"as":[102],"sequential":[104],"decision":[105],"process":[106],"optimize":[108],"rewards":[113],"defined":[114],"over":[115],"expert":[116],"latent":[120],"space.":[121],"To":[122],"make":[123],"optimization":[125],"practical,":[126],"we":[127],"introduce":[128],"Euler":[130],"Hybrid":[131],"sampler":[132],"injects":[134],"stochasticity":[135],"only":[136],"at":[137],"first":[139],"step,":[141],"enabling":[142],"tractable":[143],"low-variance":[144],"policy-gradient":[145],"estimation":[146],"while":[147],"preserving":[148],"coherence":[150],"of":[151],"remaining":[153],"trajectory.":[155],"further":[157],"combine":[158],"design":[160],"GRPO":[162],"verifiable":[165],"non-adversarial":[166],"reward.":[167],"Across":[168],"diverse":[169],"simulated":[170],"real-world":[172],"manipulation":[173],"tasks,":[174],"VAMPO":[175],"task-relevant":[177],"dynamics,":[179],"leading":[180],"better":[182],"generation":[185],"stronger":[187],"generalization.":[188],"The":[189],"homepage":[190],"https://vampo-robot.github.io/VAMPO/.":[192]},"counts_by_year":[],"updated_date":"2026-05-03T08:25:01.440150","created_date":"2026-03-24T00:00:00"}
