{"id":"https://openalex.org/W7161676639","doi":"https://doi.org/10.48550/arxiv.2605.18740","title":"Vision-OPD: Learning to See Fine Details for Multimodal LLMs via On-Policy Self-Distillation","display_name":"Vision-OPD: Learning to See Fine Details for Multimodal LLMs via On-Policy Self-Distillation","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161676639","doi":"https://doi.org/10.48550/arxiv.2605.18740"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18740","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5021935122","display_name":"Qianhao Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Qianhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136498457","display_name":"Jie Lou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lou, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136467393","display_name":"Xing Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Xing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136487104","display_name":"Hongyu Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Hongyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136462385","display_name":"Le Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Le","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136458454","display_name":"Xianpei Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Xianpei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136490903","display_name":"Yaojie Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yaojie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8468999862670898,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8468999862670898,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.03840000182390213,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0215000007301569,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.6617000102996826},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6021000146865845},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.3853999972343445},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.37940001487731934},{"id":"https://openalex.org/keywords/visual-approach","display_name":"Visual approach","score":0.3709000051021576},{"id":"https://openalex.org/keywords/policy-learning","display_name":"Policy learning","score":0.36719998717308044},{"id":"https://openalex.org/keywords/zoom","display_name":"Zoom","score":0.3626999855041504}],"concepts":[{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.6617000102996826},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6021000146865845},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5848000049591064},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.48899999260902405},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4361000061035156},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.3853999972343445},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.37940001487731934},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.36719998717308044},{"id":"https://openalex.org/C124913957","wikidata":"https://www.wikidata.org/wiki/Q1232548","display_name":"Zoom","level":3,"score":0.3626999855041504},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3582000136375427},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.34929999709129333},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.3352999985218048},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.33309999108314514},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.3203999996185303},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.31949999928474426},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3147999942302704},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.2863999903202057},{"id":"https://openalex.org/C8505890","wikidata":"https://www.wikidata.org/wiki/Q605095","display_name":"Budget constraint","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18740","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18740","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.6754544973373413}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"still":[5],"struggle":[6],"with":[7],"fine-grained":[8,34,155],"visual":[9,138,156],"understanding,":[10],"where":[11],"answers":[12,33],"often":[13],"depend":[14],"on":[15,40,44,58,153],"small":[16],"but":[17],"decisive":[18],"evidence":[19,60],"in":[20],"the":[21,30,45,83,99,120,131,135],"full":[22,47],"image.":[23],"We":[24],"observe":[25],"a":[26,77,102,106],"regional-to-global":[27,78],"perception":[28,88],"gap:":[29],"same":[31,100],"MLLM":[32],"questions":[35],"more":[36],"accurately":[37],"when":[38],"conditioned":[39],"evidence-centered":[41],"crops":[42],"than":[43,62],"corresponding":[46],"images,":[48],"suggesting":[49],"that":[50,81,160],"many":[51],"failures":[52],"stem":[53],"from":[54,98],"difficulty":[55],"to":[56,89,133],"focus":[57],"relevant":[59],"rather":[61],"insufficient":[63],"local":[64],"recognition":[65],"ability.":[66],"Motivated":[67],"by":[68],"this":[69],"observation,":[70],"we":[71],"propose":[72],"Vision-OPD":[73,93,115,161],"(Vision":[74],"On-Policy":[75],"Distillation),":[76],"self-distillation":[79],"framework":[80],"transfers":[82],"model's":[84],"own":[85],"privileged":[86],"regional":[87],"its":[90],"full-image":[91],"policy.":[92],"instantiates":[94],"two":[95],"conditional":[96],"policies":[97],"MLLM:":[101],"crop-conditioned":[103],"teacher":[104,121,142],"and":[105,114,122,173],"full-image-conditioned":[107],"student.":[108],"The":[109,177],"student":[110,123],"generates":[111],"on-policy":[112],"rollouts,":[113],"minimizes":[116],"token-level":[117],"divergence":[118],"between":[119],"next-token":[124],"distributions":[125],"along":[126],"these":[127],"rollouts.":[128],"This":[129],"enables":[130],"model":[132],"internalize":[134],"benefit":[136],"of":[137],"zooming":[139],"without":[140],"external":[141],"models,":[143],"ground-truth":[144],"labels,":[145],"reward":[146],"verifiers,":[147],"or":[148,165],"inference-time":[149],"tool":[150],"use.":[151],"Experiments":[152],"multiple":[154],"understanding":[157],"benchmarks":[158],"show":[159],"models":[162],"achieve":[163],"competitive":[164],"superior":[166],"performance":[167],"against":[168],"much":[169],"larger":[170],"open-source,":[171],"closed-source,":[172],"\"Thinking-with-Images\"":[174],"agentic":[175],"models.":[176],"code":[178],"is":[179],"available":[180],"at":[181],"https://github.com/VisionOPD/Vision-OPD":[182]},"counts_by_year":[],"updated_date":"2026-07-01T08:55:40.977307","created_date":"2026-05-20T00:00:00"}
