{"id":"https://openalex.org/W4401417048","doi":"https://doi.org/10.1109/icra57147.2024.10610090","title":"Physically Grounded Vision-Language Models for Robotic Manipulation","display_name":"Physically Grounded Vision-Language Models for Robotic Manipulation","publication_year":2024,"publication_date":"2024-05-13","ids":{"openalex":"https://openalex.org/W4401417048","doi":"https://doi.org/10.1109/icra57147.2024.10610090"},"language":"en","primary_location":{"id":"doi:10.1109/icra57147.2024.10610090","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra57147.2024.10610090","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016538750","display_name":"Jensen Gao","orcid":null},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jensen Gao","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039641037","display_name":"Bidipta Sarkar","orcid":"https://orcid.org/0000-0002-0584-3504"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bidipta Sarkar","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676785","display_name":"Fei Xia","orcid":"https://orcid.org/0000-0003-4343-1444"},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Fei Xia","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025016495","display_name":"Ted Xiao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Ted Xiao","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015588857","display_name":"Jia-Jun Wu","orcid":"https://orcid.org/0000-0003-4583-7691"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiajun Wu","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018507768","display_name":"Brian Ichter","orcid":"https://orcid.org/0000-0002-6955-6432"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Brian Ichter","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102792178","display_name":"Anirudha Majumdar","orcid":"https://orcid.org/0009-0002-2296-7485"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Anirudha Majumdar","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5080725225","display_name":"Dorsa Sadigh","orcid":"https://orcid.org/0000-0002-7802-9183"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dorsa Sadigh","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5016538750"],"corresponding_institution_ids":["https://openalex.org/I97018004"],"apc_list":null,"apc_paid":null,"fwci":18.2282,"has_fulltext":false,"cited_by_count":75,"citation_normalized_percentile":{"value":0.99637513,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"12462","last_page":"12469"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9876000285148621,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7024863958358765},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.5190559029579163},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4797850251197815},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.468959242105484},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4188283085823059}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7024863958358765},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5190559029579163},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4797850251197815},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.468959242105484},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4188283085823059}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icra57147.2024.10610090","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra57147.2024.10610090","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":49,"referenced_works":["https://openalex.org/W1583837637","https://openalex.org/W2181623680","https://openalex.org/W2277195237","https://openalex.org/W2520762063","https://openalex.org/W2560730294","https://openalex.org/W2612690371","https://openalex.org/W2613576910","https://openalex.org/W2735318784","https://openalex.org/W2763110165","https://openalex.org/W2947312908","https://openalex.org/W2963294969","https://openalex.org/W3172112830","https://openalex.org/W3199693760","https://openalex.org/W4221143046","https://openalex.org/W4224912544","https://openalex.org/W4225323055","https://openalex.org/W4226278401","https://openalex.org/W4323066451","https://openalex.org/W4324321291","https://openalex.org/W4383108895","https://openalex.org/W4385565405","https://openalex.org/W4386065596","https://openalex.org/W4386065742","https://openalex.org/W4386071509","https://openalex.org/W4388720459","https://openalex.org/W4389520366","https://openalex.org/W4389665575","https://openalex.org/W6686008357","https://openalex.org/W6739585900","https://openalex.org/W6757865169","https://openalex.org/W6777014542","https://openalex.org/W6791353385","https://openalex.org/W6800875267","https://openalex.org/W6809509765","https://openalex.org/W6809646742","https://openalex.org/W6810334672","https://openalex.org/W6810738896","https://openalex.org/W6811467201","https://openalex.org/W6839632867","https://openalex.org/W6839928859","https://openalex.org/W6842585177","https://openalex.org/W6845994847","https://openalex.org/W6849177959","https://openalex.org/W6849783008","https://openalex.org/W6850495229","https://openalex.org/W6850503672","https://openalex.org/W6850553056","https://openalex.org/W6850876251","https://openalex.org/W6853116092"],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,45,134],"vision-language":[3],"models":[4,23],"(VLMs)":[5],"have":[6],"led":[7],"to":[8,27,115,160],"improved":[9,147],"performance":[10,149],"on":[11,104,150,179],"tasks":[12,65,151],"such":[13,36,73],"as":[14,37],"visual":[15,126],"question":[16],"answering":[17],"and":[18,69,88,145,193,197],"image":[19],"captioning.":[20],"Consequently,":[21],"these":[22,123],"are":[24,43],"now":[25],"well-positioned":[26],"reason":[28],"about":[29,72,155],"the":[30,49,172],"physical":[31,50,70,91,110,156],"world,":[32],"particularly":[33],"within":[34],"domains":[35],"robotic":[38,63,143],"manipulation.":[39],"However,":[40],"current":[41],"VLMs":[42],"limited":[44],"their":[46,60],"understanding":[47,108],"of":[48,55,85,94,109,122,174,199],"concepts":[51,124],"(e.g.,":[52],"material,":[53],"fragility)":[54],"common":[56,95],"objects,":[57],"which":[58],"restricts":[59],"usefulness":[61],"for":[62],"manipulation":[64],"that":[66,100,152,162],"involve":[67],"interaction":[68],"reasoning":[71,154],"objects.":[74,97],"To":[75],"address":[76],"this":[77,130],"limitation,":[78],"we":[79],"propose":[80],"PHYSOBJECTS,":[81],"an":[82,135],"object-centric":[83],"dataset":[84,192],"39.6K":[86],"crowd-sourced":[87],"417K":[89],"automated":[90],"concept":[92],"annotations":[93],"household":[96],"We":[98,128,169,189],"demonstrate":[99],"fine-tuning":[101],"a":[102,139,180],"VLM":[103,133,178],"PhysObjects":[105],"improves":[106,185],"its":[107],"object":[111,157],"concepts,":[112,117,158],"including":[113],"generalization":[114],"held-out":[116],"by":[118],"capturing":[119],"human":[120],"priors":[121],"from":[125],"appearance.":[127],"incorporate":[129],"physically":[131,166,176],"grounded":[132,167,177],"interactive":[136],"framework":[137],"with":[138],"large":[140],"language":[141],"model-based":[142],"planner,":[144],"show":[146],"planning":[148],"require":[153],"compared":[159],"baselines":[161],"do":[163],"not":[164],"leverage":[165],"VLMs.":[168],"additionally":[170],"illustrate":[171],"benefits":[173],"our":[175,191,200],"real":[181],"robot,":[182],"where":[183],"it":[184],"task":[186],"success":[187],"rates.":[188],"release":[190],"provide":[194],"further":[195],"details":[196],"visualizations":[198],"results":[201],"at":[202],"https://iliad.stanford.edu/pg-vlm/.":[203]},"counts_by_year":[{"year":2026,"cited_by_count":10},{"year":2025,"cited_by_count":52},{"year":2024,"cited_by_count":13}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
