{"id":"https://openalex.org/W7154144243","doi":"https://doi.org/10.48550/arxiv.2604.09349","title":"Visually-Guided Policy Optimization for Multimodal Reasoning","display_name":"Visually-Guided Policy Optimization for Multimodal Reasoning","publication_year":2026,"publication_date":"2026-04-10","ids":{"openalex":"https://openalex.org/W7154144243","doi":"https://doi.org/10.48550/arxiv.2604.09349"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.09349","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09349","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.09349","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133511830","display_name":"Zengbin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zengbin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133537059","display_name":"Feng Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Feng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133506901","display_name":"Liang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Liang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041762969","display_name":"Xuecai Hu","orcid":"https://orcid.org/0000-0003-0483-0418"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xuecai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100751600","display_name":"Yong Wang","orcid":"https://orcid.org/0000-0002-5383-5736"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133477729","display_name":"Yanlin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yanlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133520417","display_name":"Man Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Man","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133518005","display_name":"Xiangxiang Chu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chu, Xiangxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9222999811172485,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9222999811172485,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.013000000268220901,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.01080000028014183,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5896000266075134},{"id":"https://openalex.org/keywords/forgetting","display_name":"Forgetting","score":0.5618000030517578},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.46540001034736633},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.44179999828338623},{"id":"https://openalex.org/keywords/visual-attention","display_name":"Visual attention","score":0.43230000138282776},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4235999882221222},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.41280001401901245},{"id":"https://openalex.org/keywords/visual-learning","display_name":"Visual learning","score":0.39239999651908875}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7149999737739563},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6195999979972839},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5896000266075134},{"id":"https://openalex.org/C7149132","wikidata":"https://www.wikidata.org/wiki/Q1377840","display_name":"Forgetting","level":2,"score":0.5618000030517578},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.46540001034736633},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.44179999828338623},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.43230000138282776},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4235999882221222},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.41280001401901245},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.39239999651908875},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38609999418258667},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.38269999623298645},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.3386000096797943},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33000001311302185},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.32580000162124634},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.31459999084472656},{"id":"https://openalex.org/C178278151","wikidata":"https://www.wikidata.org/wiki/Q7936607","display_name":"Visual memory","level":3,"score":0.2858000099658966},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.28369998931884766},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C164280684","wikidata":"https://www.wikidata.org/wiki/Q5529040","display_name":"Gaze-contingency paradigm","level":4,"score":0.25290000438690186},{"id":"https://openalex.org/C2778251979","wikidata":"https://www.wikidata.org/wiki/Q7936617","display_name":"Visual processing","level":3,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.09349","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09349","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.09349","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09349","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1],"with":[2],"verifiable":[3],"rewards":[4],"(RLVR)":[5],"has":[6,156],"significantly":[7],"advanced":[8],"the":[9,17,115,126],"reasoning":[10,48,150],"ability":[11],"of":[12,21],"vision-language":[13],"models":[14],"(VLMs).":[15],"However,":[16],"inherent":[18],"text-dominated":[19],"nature":[20],"VLMs":[22],"often":[23],"leads":[24],"to":[25,34,66,86,100],"insufficient":[26],"visual":[27,35,45,68,84,90,95,102,123,133,142],"faithfulness,":[28],"characterized":[29],"by":[30],"sparse":[31],"attention":[32],"activation":[33,143],"tokens.":[36],"More":[37],"importantly,":[38],"our":[39],"empirical":[40],"analysis":[41],"reveals":[42],"that":[43,82,138],"temporal":[44],"forgetting":[46],"along":[47],"steps":[49,99],"exacerbates":[50],"this":[51,55,106],"deficiency.":[52],"To":[53],"bridge":[54],"gap,":[56],"we":[57,108],"propose":[58],"Visually-Guided":[59],"Policy":[60],"Optimization":[61],"(VGPO),":[62],"a":[63,77,110],"novel":[64],"framework":[65],"reinforce":[67],"focus":[69],"during":[70],"policy":[71],"optimization.":[72],"Specifically,":[73],"VGPO":[74,139],"initially":[75],"introduces":[76],"Visual":[78],"Attention":[79],"Compensation":[80],"mechanism":[81],"leverages":[83],"similarity":[85],"localize":[87],"and":[88,144,151],"amplify":[89],"cues,":[91],"while":[92,125],"progressively":[93],"elevating":[94],"expectations":[96],"in":[97,147],"later":[98],"counteract":[101],"forgetting.":[103],"Building":[104],"on":[105],"mechanism,":[107],"implement":[109],"dual-grained":[111],"advantage":[112],"re-weighting":[113],"strategy:":[114],"intra-trajectory":[116],"level":[117,128],"highlights":[118],"tokens":[119],"exhibiting":[120],"relatively":[121],"high":[122],"activation,":[124],"inter-trajectory":[127],"prioritizes":[129],"trajectories":[130],"demonstrating":[131],"superior":[132,145],"accumulation.":[134],"Extensive":[135],"experiments":[136],"demonstrate":[137],"achieves":[140],"better":[141],"performance":[146],"mathematical":[148],"multimodal":[149],"visual-dependent":[152],"tasks.":[153],"The":[154],"code":[155],"been":[157],"released":[158],"at":[159],"https://github.com/wzb-bupt/VGPO.":[160]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-14T00:00:00"}
