{"id":"https://openalex.org/W7151585725","doi":"https://doi.org/10.48550/arxiv.2604.04917","title":"Vero: An Open RL Recipe for General Visual Reasoning","display_name":"Vero: An Open RL Recipe for General Visual Reasoning","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7151585725","doi":"https://doi.org/10.48550/arxiv.2604.04917"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04917","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04917","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04917","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035580849","display_name":"Gabriel Sarch","orcid":"https://orcid.org/0000-0002-4396-7612"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sarch, Gabriel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133125545","display_name":"Linrong Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Linrong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133113925","display_name":"Qunzhong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qunzhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101662710","display_name":"Qi Wu","orcid":"https://orcid.org/0000-0002-1779-8923"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Haoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133095775","display_name":"Danqi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Danqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133129814","display_name":"Zhuang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhuang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5035580849"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.984499990940094,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.984499990940094,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0024999999441206455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0013000000035390258,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantic-reasoner","display_name":"Semantic reasoner","score":0.8205999732017517},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.7405999898910522},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.6312000155448914},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.6226999759674072},{"id":"https://openalex.org/keywords/recipe","display_name":"Recipe","score":0.5062999725341797},{"id":"https://openalex.org/keywords/base","display_name":"Base (topology)","score":0.48590001463890076},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4309000074863434},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.43059998750686646}],"concepts":[{"id":"https://openalex.org/C9616225","wikidata":"https://www.wikidata.org/wiki/Q3929429","display_name":"Semantic reasoner","level":2,"score":0.8205999732017517},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7572000026702881},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.7405999898910522},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.6312000155448914},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.6226999759674072},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6194000244140625},{"id":"https://openalex.org/C2778671685","wikidata":"https://www.wikidata.org/wiki/Q219239","display_name":"Recipe","level":2,"score":0.5062999725341797},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.48590001463890076},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4309000074863434},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.43059998750686646},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.4074000120162964},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.40549999475479126},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38679999113082886},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3156000077724457},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2574000060558319},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04917","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04917","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04917","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04917","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","score":0.4723265767097473,"id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"What":[0],"does":[1],"it":[2],"take":[3],"to":[4],"build":[5],"a":[6,52,83],"visual":[7,27,67],"reasoner":[8],"that":[9,58,93,154,163,169],"works":[10],"across":[11,65,76,112,148],"charts,":[12],"science,":[13],"spatial":[14],"understanding,":[15],"and":[16,74,89,184],"open-ended":[17],"tasks?":[18],"The":[19],"strongest":[20],"vision-language":[21],"models":[22,64,106,185],"(VLMs)":[23],"show":[24],"such":[25],"broad":[26,78,170],"reasoning":[28,68,161],"is":[29,173],"within":[30],"reach,":[31],"but":[32],"the":[33,139,174],"recipe":[34],"behind":[35,40],"them":[36],"remains":[37],"unclear,":[38],"locked":[39],"proprietary":[41,133],"reinforcement":[42],"learning":[43],"(RL)":[44],"pipelines":[45],"with":[46],"non-public":[47],"data.":[48,135],"We":[49,70],"introduce":[50],"Vero,":[51],"family":[53],"of":[54,116,128,177],"fully":[55],"open":[56],"VLMs":[57],"matches":[59],"or":[60],"exceeds":[61,144],"existing":[62,145],"open-weight":[63],"diverse":[66],"tasks.":[69],"scale":[71],"RL":[72,146,179],"data":[73,171],"rewards":[75,92],"six":[77],"task":[79,149,156],"categories,":[80],"constructing":[81],"Vero-600K,":[82],"600K-sample":[84],"dataset":[85],"from":[86,121,138],"59":[87],"datasets,":[88],"designing":[90],"task-routed":[91],"handle":[94],"heterogeneous":[95],"answer":[96],"formats.":[97],"Vero":[98,123],"achieves":[99],"state-of-the-art":[100],"performance,":[101],"improving":[102],"over":[103],"four":[104],"base":[105,141],"by":[107],"3.6-5.3":[108],"points":[109],"on":[110,126],"average":[111],"VeroEval,":[113],"our":[114],"suite":[115],"30":[117,129],"challenging":[118],"benchmarks.":[119],"Starting":[120],"Qwen3-VL-8B-Instruct,":[122],"outperforms":[124],"Qwen3-VL-8B-Thinking":[125],"23":[127],"benchmarks":[130],"without":[131],"additional":[132],"thinking":[134],"When":[136],"trained":[137],"same":[140],"model,":[142],"Vero-600K":[143],"datasets":[147],"categories.":[150],"Systematic":[151],"ablations":[152],"reveal":[153],"different":[155],"categories":[157],"elicit":[158],"qualitatively":[159],"distinct":[160],"patterns":[162],"transfer":[164],"poorly":[165],"in":[166],"isolation,":[167],"suggesting":[168],"coverage":[172],"primary":[175],"driver":[176],"strong":[178],"scaling.":[180],"All":[181],"data,":[182],"code,":[183],"are":[186],"released.":[187]},"counts_by_year":[],"updated_date":"2026-04-09T06:08:40.794217","created_date":"2026-04-08T00:00:00"}
