{"id":"https://openalex.org/W4413918030","doi":"https://doi.org/10.1109/icra55743.2025.11128156","title":"KALIE: Fine-Tuning Vision-Language Models for Open-World Manipulation Without Robot Data","display_name":"KALIE: Fine-Tuning Vision-Language Models for Open-World Manipulation Without Robot Data","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W4413918030","doi":"https://doi.org/10.1109/icra55743.2025.11128156"},"language":"en","primary_location":{"id":"doi:10.1109/icra55743.2025.11128156","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra55743.2025.11128156","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055096808","display_name":"Grace Tang","orcid":"https://orcid.org/0000-0001-6179-5343"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Grace Tang","raw_affiliation_strings":["University of California,Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052018934","display_name":"Swetha Rajkumar","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Swetha Rajkumar","raw_affiliation_strings":["University of California,Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101450478","display_name":"Yifei Zhou","orcid":"https://orcid.org/0000-0003-1207-1810"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifei Zhou","raw_affiliation_strings":["University of California,Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064208112","display_name":"Homer Walke","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Homer Rich Walke","raw_affiliation_strings":["University of California,Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026322200","display_name":"Sergey Levine","orcid":"https://orcid.org/0000-0001-6764-2743"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sergey Levine","raw_affiliation_strings":["University of California,Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113787478","display_name":"Kuan Fang","orcid":null},"institutions":[{"id":"https://openalex.org/I205783295","display_name":"Cornell University","ror":"https://ror.org/05bnh6r87","country_code":"US","type":"education","lineage":["https://openalex.org/I205783295"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kuan Fang","raw_affiliation_strings":["Cornell University,USA"],"affiliations":[{"raw_affiliation_string":"Cornell University,USA","institution_ids":["https://openalex.org/I205783295"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5055096808"],"corresponding_institution_ids":["https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":2.5312,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.90733075,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"9507","last_page":"9515"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9817000031471252,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7178951501846313},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5698098540306091},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5166687965393066},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5009515285491943},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4251038432121277}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7178951501846313},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5698098540306091},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5166687965393066},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5009515285491943},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4251038432121277}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icra55743.2025.11128156","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra55743.2025.11128156","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3484629716","display_name":null,"funder_award_id":"FA9550-22-1-0273","funder_id":"https://openalex.org/F4320338279","funder_display_name":"Air Force Office of Scientific Research"},{"id":"https://openalex.org/G3921806535","display_name":null,"funder_award_id":"IIS-2246811","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320338279","display_name":"Air Force Office of Scientific Research","ror":"https://ror.org/011e9bt93"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2201912979","https://openalex.org/W2605102758","https://openalex.org/W2962736495","https://openalex.org/W2981030070","https://openalex.org/W2985936292","https://openalex.org/W3021708257","https://openalex.org/W3110015970","https://openalex.org/W3167788848","https://openalex.org/W3204715535","https://openalex.org/W4210700398","https://openalex.org/W4287811291","https://openalex.org/W4312933868","https://openalex.org/W4313186194","https://openalex.org/W4366850747","https://openalex.org/W4367721889","https://openalex.org/W4377164418","https://openalex.org/W4382491206","https://openalex.org/W4383097638","https://openalex.org/W4384264726","https://openalex.org/W4385403849","https://openalex.org/W4385430538","https://openalex.org/W4385430679","https://openalex.org/W4385473486","https://openalex.org/W4386076371","https://openalex.org/W4386185624","https://openalex.org/W4387560542","https://openalex.org/W4387560742","https://openalex.org/W4387800098","https://openalex.org/W4388327689","https://openalex.org/W4388482029","https://openalex.org/W4389218004","https://openalex.org/W4390190098","https://openalex.org/W4390873054","https://openalex.org/W4390874575","https://openalex.org/W4391591240","https://openalex.org/W4397028714","https://openalex.org/W4402353985","https://openalex.org/W4402354047","https://openalex.org/W4402354082","https://openalex.org/W4402670718","https://openalex.org/W4402753874","https://openalex.org/W4404612908"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Building":[0],"generalist":[1],"robotic":[2,44,96],"systems":[3],"involves":[4],"effectively":[5],"endowing":[6],"robots":[7],"with":[8,83,132],"the":[9,21,58,74,89],"capabilities":[10],"to":[11,126,142],"handle":[12],"novel":[13],"objects":[14,134],"in":[15,46],"an":[16,99],"open-world":[17],"setting.":[18],"Inspired":[19],"by":[20,60,86,118],"advances":[22],"of":[23,51,73],"large":[24],"pre-trained":[25,38,145],"models,":[26],"we":[27],"propose":[28],"Keypoint":[29],"Affordance":[30],"Learning":[31],"from":[32],"Imagined":[33],"Environments":[34],"(KALIE),":[35],"which":[36],"adapts":[37],"Vision":[39],"Language":[40],"Models":[41],"(VLMs)":[42],"for":[43,91],"control":[45],"a":[47],"scalable":[48],"manner.":[49],"Instead":[50],"directly":[52],"producing":[53],"motor":[54],"commands,":[55],"KALIE":[56,104,123],"controls":[57],"robot":[59],"predicting":[61],"pointbased":[62],"affordance":[63],"representations":[64],"based":[65,111],"on":[66,80,95,112],"natural":[67],"language":[68],"instructions":[69],"and":[70],"visual":[71],"observations":[72],"scene.":[75],"The":[76],"VLM":[77],"is":[78],"trained":[79],"2D":[81],"images":[82],"affordances":[84],"labeled":[85],"humans,":[87],"bypassing":[88],"need":[90],"training":[92,109],"data":[93,101,110,115,139],"collected":[94,117],"systems.":[97],"Through":[98],"affordance-aware":[100],"synthesis":[102],"pipeline,":[103],"automatically":[105],"creates":[106],"massive":[107],"high-quality":[108],"limited":[113],"example":[114,138],"manually":[116],"humans.":[119],"We":[120],"demonstrate":[121],"that":[122],"can":[124],"learn":[125],"robustly":[127],"solve":[128],"new":[129],"manipulation":[130],"tasks":[131],"unseen":[133],"given":[135],"only":[136],"50":[137],"points.":[140],"Compared":[141],"baselines":[143],"using":[144],"VLMs,":[146],"our":[147],"approach":[148],"consistently":[149],"achieves":[150],"superior":[151],"performance.":[152]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
