{"id":"https://openalex.org/W4413926173","doi":"https://doi.org/10.1109/icra55743.2025.11128270","title":"Chain-of-Modality: Learning Manipulation Programs from Multimodal Human Videos with Vision-Language-Models","display_name":"Chain-of-Modality: Learning Manipulation Programs from Multimodal Human Videos with Vision-Language-Models","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W4413926173","doi":"https://doi.org/10.1109/icra55743.2025.11128270"},"language":"en","primary_location":{"id":"doi:10.1109/icra55743.2025.11128270","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra55743.2025.11128270","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100337591","display_name":"Chen Wang","orcid":"https://orcid.org/0000-0002-5340-9737"},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":true,"raw_author_name":"Chen Wang","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100676785","display_name":"Fei Xia","orcid":"https://orcid.org/0000-0003-4343-1444"},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Fei Xia","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015524477","display_name":"Wenhao Yu","orcid":"https://orcid.org/0000-0001-8263-8224"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Wenhao Yu","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080017055","display_name":"Tingnan Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Tingnan Zhang","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101649950","display_name":"Ruohan Zhang","orcid":"https://orcid.org/0000-0001-6681-3360"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ruohan Zhang","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103249600","display_name":"C. Karen Liu","orcid":"https://orcid.org/0009-0004-0113-2002"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"C. Karen Liu","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100450462","display_name":"Li Fei-Fei","orcid":"https://orcid.org/0000-0002-7481-0810"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Li Fei-Fei","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103241086","display_name":"Jie Tan","orcid":"https://orcid.org/0000-0002-7947-0333"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Jie Tan","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091443553","display_name":"Jacky Liang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090411","display_name":"DeepMind (United Kingdom)","ror":"https://ror.org/00971b260","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210090411","https://openalex.org/I4210128969"]},{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Jacky Liang","raw_affiliation_strings":["Google DeepMind"],"affiliations":[{"raw_affiliation_string":"Google DeepMind","institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100337591"],"corresponding_institution_ids":["https://openalex.org/I1291425158","https://openalex.org/I4210090411"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28429186,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"6527","last_page":"6535"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5432000160217285,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5432000160217285,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.7503750324249268},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7175166606903076},{"id":"https://openalex.org/keywords/chain","display_name":"Chain (unit)","score":0.4859508275985718},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.46341556310653687},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.4556356966495514},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4467261731624603},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4346689283847809},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.4123196303844452},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.08726125955581665}],"concepts":[{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.7503750324249268},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7175166606903076},{"id":"https://openalex.org/C199185054","wikidata":"https://www.wikidata.org/wiki/Q552299","display_name":"Chain (unit)","level":2,"score":0.4859508275985718},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.46341556310653687},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.4556356966495514},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4467261731624603},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4346689283847809},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.4123196303844452},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.08726125955581665},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C1276947","wikidata":"https://www.wikidata.org/wiki/Q333","display_name":"Astronomy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icra55743.2025.11128270","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icra55743.2025.11128270","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1483019628","https://openalex.org/W1927052826","https://openalex.org/W2583137229","https://openalex.org/W2625366777","https://openalex.org/W2769112066","https://openalex.org/W2777985721","https://openalex.org/W2810685774","https://openalex.org/W2963247196","https://openalex.org/W2963315828","https://openalex.org/W2963802910","https://openalex.org/W2979490629","https://openalex.org/W3038298277","https://openalex.org/W3106768499","https://openalex.org/W3108330043","https://openalex.org/W3122520957","https://openalex.org/W3175995235","https://openalex.org/W3205786327","https://openalex.org/W3207832698","https://openalex.org/W4221167977","https://openalex.org/W4283788863","https://openalex.org/W4312480274","https://openalex.org/W4312508181","https://openalex.org/W4383097638","https://openalex.org/W4383108457","https://openalex.org/W4386065350","https://openalex.org/W4388660746","https://openalex.org/W4393160795","https://openalex.org/W4401417162","https://openalex.org/W4402354026","https://openalex.org/W4402354080","https://openalex.org/W4402354151","https://openalex.org/W4402704593","https://openalex.org/W4402716288","https://openalex.org/W4402890475"],"related_works":["https://openalex.org/W3093803775","https://openalex.org/W4381827277","https://openalex.org/W3157841754","https://openalex.org/W4390136517","https://openalex.org/W2563212008","https://openalex.org/W4399869253","https://openalex.org/W2477990774","https://openalex.org/W3167558523","https://openalex.org/W3120825179","https://openalex.org/W2248308732"],"abstract_inverted_index":{"Learning":[0],"to":[1,55,67,74,94,128,159,164],"perform":[2,75,129],"manipulation":[3,17,62,130],"tasks":[4,18,131],"from":[5,113],"human":[6,47,61,98,137],"videos":[7,102],"is":[8],"a":[9,86,118,134,146],"promising":[10],"approach":[11],"for":[12,151],"teaching":[13],"robots.":[14],"However,":[15],"many":[16],"require":[19],"changing":[20],"control":[21,72,124,156],"parameters":[22,73,157],"during":[23],"task":[24,69,119,153,166],"execution,":[25],"such":[26,42],"as":[27,43],"force,":[28],"which":[29],"visual":[30],"data":[31,100],"alone":[32],"cannot":[33],"capture.":[34],"In":[35],"this":[36],"work,":[37],"we":[38,82],"leverage":[39],"sensing":[40],"devices":[41],"armbands":[44],"that":[45,52,89,143],"measure":[46],"muscle":[48,105],"activities":[49],"and":[50,64,71,121,155,168,175],"microphones":[51],"record":[53],"sound,":[54],"capture":[56],"the":[57,60,76],"details":[58],"in":[59,149,170],"process,":[63],"enable":[65],"robots":[66,127],"extract":[68],"plans":[70,154],"same":[77],"task.":[78],"To":[79],"achieve":[80],"this,":[81],"introduce":[83],"Chain-of-Modality":[84],"(CoM),":[85],"prompting":[87],"strategy":[88],"enables":[90],"Vision":[91],"Language":[92],"Models":[93],"reason":[95],"about":[96],"multimodal":[97,136],"demonstration":[99],"-":[101],"coupled":[103],"with":[104,161],"or":[106],"audio":[107],"signals.":[108],"By":[109],"progressively":[110],"integrating":[111],"information":[112],"each":[114],"modality,":[115],"CoM":[116,144],"refines":[117],"plan":[120],"generates":[122],"detailed":[123],"parameters,":[125],"enabling":[126],"based":[132],"on":[133],"single":[135],"video":[138],"prompt.":[139],"Our":[140],"experiments":[141],"show":[142],"delivers":[145],"threefold":[147],"improvement":[148],"accuracy":[150],"extracting":[152],"compared":[158],"baselines,":[160],"strong":[162],"generalization":[163],"new":[165],"setups":[167],"objects":[169],"real-world":[171],"robot":[172],"experiments.":[173],"Videos":[174],"code":[176],"are":[177],"available":[178],"at":[179],"chain-of-modality.github.io":[180]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
