{"id":"https://openalex.org/W4387839182","doi":"https://doi.org/10.48550/arxiv.2310.12921","title":"Vision-Language Models are Zero-Shot Reward Models for Reinforcement Learning","display_name":"Vision-Language Models are Zero-Shot Reward Models for Reinforcement Learning","publication_year":2023,"publication_date":"2023-10-19","ids":{"openalex":"https://openalex.org/W4387839182","doi":"https://doi.org/10.48550/arxiv.2310.12921"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2310.12921","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.12921","pdf_url":"https://arxiv.org/pdf/2310.12921","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2310.12921","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001084719","display_name":"Juan Rocamonde","orcid":"https://orcid.org/0000-0003-1253-9110"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rocamonde, Juan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075777411","display_name":"Victoriano Montesinos","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Montesinos, Victoriano","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089475940","display_name":"Elvis Nava","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nava, Elvis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091112967","display_name":"Ethan Perez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Perez, Ethan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5059597480","display_name":"David Lindner","orcid":"https://orcid.org/0000-0001-7051-7433"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lindner, David","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5001084719"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.32109999656677246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.32109999656677246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zero","display_name":"Zero (linguistics)","score":0.7550697326660156},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7305226922035217},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5914591550827026},{"id":"https://openalex.org/keywords/reinforcement","display_name":"Reinforcement","score":0.43409907817840576},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.43354088068008423},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.40049242973327637},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.37130624055862427},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.34109044075012207},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.14144915342330933},{"id":"https://openalex.org/keywords/social-psychology","display_name":"Social psychology","score":0.13562047481536865},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.062082916498184204}],"concepts":[{"id":"https://openalex.org/C2780813799","wikidata":"https://www.wikidata.org/wiki/Q3274237","display_name":"Zero (linguistics)","level":2,"score":0.7550697326660156},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7305226922035217},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5914591550827026},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.43409907817840576},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.43354088068008423},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.40049242973327637},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.37130624055862427},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.34109044075012207},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.14144915342330933},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.13562047481536865},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.062082916498184204},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2310.12921","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.12921","pdf_url":"https://arxiv.org/pdf/2310.12921","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2310.12921","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2310.12921","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2310.12921","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.12921","pdf_url":"https://arxiv.org/pdf/2310.12921","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1652710701","display_name":null,"funder_award_id":"CRSII5_","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G2527289611","display_name":"Ultra compact miniaturized microscopes to image meso-scale brain activity","funder_award_id":"189251","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G3563205607","display_name":"Temporal Information Integration in Neural Networks","funder_award_id":"173721","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G47709386","display_name":null,"funder_award_id":"CRSII5-173721","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G4966013024","display_name":null,"funder_award_id":"315230","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G5299779782","display_name":null,"funder_award_id":"315230_189251","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G5923285487","display_name":null,"funder_award_id":"CRSII5_173721","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G8219606064","display_name":null,"funder_award_id":"ETH-20 19-01","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"},{"id":"https://openalex.org/G8619720159","display_name":"Grundlagen des einstweiligen Rechtschutzes","funder_award_id":"15230","funder_id":"https://openalex.org/F4320320924","funder_display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320315389","display_name":"Open Philanthropy Project","ror":"https://ror.org/004d1k391"},{"id":"https://openalex.org/F4320320924","display_name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung","ror":"https://ror.org/00yjd3n13"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4387839182.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2074502265","https://openalex.org/W4214877189","https://openalex.org/W2773965352","https://openalex.org/W2381179799","https://openalex.org/W2980279061","https://openalex.org/W2334685461","https://openalex.org/W2920061524","https://openalex.org/W4310083477","https://openalex.org/W2328553770","https://openalex.org/W2366718574"],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1,15],"(RL)":[2],"requires":[3],"either":[4],"manually":[5,87],"specifying":[6],"a":[7,16,20,33,55,78,86,100,111,139,162,243],"reward":[8,17,44,64,89,178,240],"function,":[9,90],"which":[10,26,66],"is":[11,27,226],"often":[12,28],"infeasible,":[13],"or":[14,203],"model":[18],"from":[19],"large":[21,227],"amount":[22],"of":[23,105,127,147,183,194,246],"human":[24],"feedback,":[25],"very":[29],"expensive.":[30],"We":[31,53,70,124,133,214],"study":[32],"more":[34,172,236,238],"sample-efficient":[35],"alternative:":[36],"using":[37,61],"pretrained":[38],"vision-language":[39],"models":[40,45,241],"(VLMs)":[41],"as":[42,63,92,198,221,223],"zero-shot":[43],"(RMs)":[46],"to":[47,60,76,81,153,190],"specify":[48],"tasks":[49,84],"via":[50],"natural":[51,56],"language.":[52],"propose":[54],"and":[57,97,143,157,174,237],"general":[58],"approach":[59],"VLMs":[62,169,233],"models,":[65],"we":[67,108,160,185],"call":[68],"VLM-RMs.":[69],"use":[71],"VLM-RMs":[72,184,217],"based":[73],"on":[74],"CLIP":[75,149],"train":[77],"MuJoCo":[79],"humanoid":[80],"learn":[82],"complex":[83],"without":[85],"specified":[88],"such":[91,197],"kneeling,":[93],"doing":[94],"the":[95,117,128,148,212,224],"splits,":[96],"sitting":[98],"in":[99],"lotus":[101],"position.":[102],"For":[103],"each":[104],"these":[106],"tasks,":[107],"only":[109],"provide":[110,125],"single":[112],"sentence":[113],"text":[114],"prompt":[115,122,142],"describing":[116],"desired":[118],"task":[119],"with":[120,171],"minimal":[121],"engineering.":[123],"videos":[126],"trained":[129,170],"agents":[130],"at:":[131],"https://sites.google.com/view/vlm-rm.":[132],"can":[134],"improve":[135],"performance":[136],"by":[137],"providing":[138],"second":[140],"\"baseline\"":[141],"projecting":[144],"out":[145],"parts":[146],"embedding":[150],"space":[151],"irrelevant":[152],"distinguish":[154],"between":[155],"goal":[156],"baseline.":[158],"Further,":[159],"find":[161,215],"strong":[163],"scaling":[164],"effect":[165],"for":[166,211,242],"VLM-RMs:":[167],"larger":[168],"compute":[173],"data":[175],"are":[176,187,208,218],"better":[177],"models.":[179],"The":[180],"failure":[181],"modes":[182],"encountered":[186],"all":[188],"related":[189],"known":[191],"capability":[192],"limitations":[193],"current":[195],"VLMs,":[196],"limited":[199],"spatial":[200],"reasoning":[201],"ability":[202],"visually":[204],"unrealistic":[205],"environments":[206],"that":[207,216,231],"far":[209],"off-distribution":[210],"VLM.":[213],"remarkably":[219],"robust":[220],"long":[222],"VLM":[225],"enough.":[228],"This":[229],"suggests":[230],"future":[232],"will":[234],"become":[235],"useful":[239],"wide":[244],"range":[245],"RL":[247],"applications.":[248]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2023-10-21T00:00:00"}
