{"id":"https://openalex.org/W4413926057","doi":"https://doi.org/10.1109/icra55743.2025.11128038","title":"KUDA: Keypoints to Unify Dynamics Learning and Visual Prompting for Open-Vocabulary Robotic Manipulation","display_name":"KUDA: Keypoints to Unify Dynamics Learning and Visual Prompting for Open-Vocabulary Robotic Manipulation","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W4413926057","doi":"https://doi.org/10.1109/icra55743.2025.11128038"},"language":"en","primary_location":{"id":"doi:10.1109/icra55743.2025.11128038","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra55743.2025.11128038","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101780229","display_name":"Zixian Liu","orcid":"https://orcid.org/0000-0001-6471-2128"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zixian Liu","raw_affiliation_strings":["Tsinghua University"],"affiliations":[{"raw_affiliation_string":"Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057020108","display_name":"Mingtong Zhang","orcid":"https://orcid.org/0000-0003-4438-7756"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mingtong Zhang","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015873735","display_name":"Yunzhu Li","orcid":"https://orcid.org/0000-0003-3954-5465"},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yunzhu Li","raw_affiliation_strings":["Columbia University"],"affiliations":[{"raw_affiliation_string":"Columbia University","institution_ids":["https://openalex.org/I78577930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101780229"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.13426951,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"10561","last_page":"10569"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9794999957084656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9794999957084656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.9521999955177307,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6873539686203003},{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics (music)","score":0.5961195230484009},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5662652254104614},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5086110830307007},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3760022521018982},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.37144386768341064},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.10629794001579285},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.07858103513717651},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.05867817997932434}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6873539686203003},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.5961195230484009},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5662652254104614},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5086110830307007},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3760022521018982},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37144386768341064},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.10629794001579285},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.07858103513717651},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.05867817997932434},{"id":"https://openalex.org/C19417346","wikidata":"https://www.wikidata.org/wiki/Q7922","display_name":"Pedagogy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icra55743.2025.11128038","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra55743.2025.11128038","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W100554921","https://openalex.org/W2236233024","https://openalex.org/W2296135247","https://openalex.org/W2528489519","https://openalex.org/W3100909505","https://openalex.org/W3147794100","https://openalex.org/W4283788970","https://openalex.org/W4313026212","https://openalex.org/W4383108457","https://openalex.org/W4385430679","https://openalex.org/W4389519952","https://openalex.org/W4390874575","https://openalex.org/W4394828156","https://openalex.org/W4401417251","https://openalex.org/W4402354047","https://openalex.org/W4402716166","https://openalex.org/W4402727137","https://openalex.org/W4404612908","https://openalex.org/W4405786528"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"With":[0],"the":[1,29,105,110,163],"rapid":[2],"advancement":[3],"of":[4,31,144,165],"large":[5],"language":[6,95,149],"models":[7,11],"(LLMs)":[8],"and":[9,56,64,83,97,108,157],"vision-language":[10],"(VLMs),":[12],"significant":[13],"progress":[14],"has":[15],"been":[16],"made":[17],"in":[18],"developing":[19],"open-vocabulary":[20,49],"robotic":[21,136],"manipulation":[22,50,145],"systems.":[23],"However,":[24],"many":[25],"existing":[26],"approaches":[27],"overlook":[28],"importance":[30],"object":[32,153],"dynamics,":[33],"limiting":[34],"their":[35],"applicability":[36],"to":[37,104,112,134],"more":[38],"complex,":[39],"dynamic":[40],"tasks.":[41],"In":[42],"this":[43],"work,":[44],"we":[45],"introduce":[46],"KUDA,":[47],"an":[48],"system":[51],"that":[52,73],"integrates":[53],"dynamics":[54,67,132],"learning":[55],"visual":[57,98],"prompting":[58],"through":[59],"keypoints,":[60],"leveraging":[61],"both":[62],"VLMs":[63,82],"learning-based":[65],"neural":[66],"models.":[68],"Our":[69],"key":[70],"insight":[71],"is":[72,78,171],"a":[74,130,142],"keypoint-based":[75,118],"target":[76,114],"specification":[77],"simultaneously":[79],"interpretable":[80],"by":[81],"can":[84],"be":[85],"efficiently":[86],"translated":[87],"into":[88,123],"cost":[89,124],"functions":[90],"for":[91],"model-based":[92],"planning.":[93],"Given":[94],"instructions":[96,150],"observations,":[99],"KUDA":[100,140],"first":[101],"assigns":[102],"keypoints":[103],"RGB":[106],"image":[107],"queries":[109],"VLM":[111],"generate":[113],"specifications.":[115],"These":[116],"abstract":[117],"representations":[119],"are":[120,127],"then":[121],"converted":[122],"functions,":[125],"which":[126],"optimized":[128],"using":[129],"learned":[131],"model":[133],"produce":[135],"trajectories.":[137],"We":[138],"evaluate":[139],"on":[141],"range":[143],"tasks,":[146],"including":[147],"free-form":[148],"across":[151],"diverse":[152],"categories,":[154],"multi-object":[155],"interactions,":[156],"deformable":[158],"or":[159],"granular":[160],"objects,":[161],"demonstrating":[162],"effectiveness":[164],"our":[166],"framework.":[167],"The":[168],"project":[169],"page":[170],"available":[172],"at":[173],"http://kuda-dynamics.github.io.":[174]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
