{"id":"https://openalex.org/W4402753568","doi":"https://doi.org/10.1109/cvpr52733.2024.01327","title":"Generate Subgoal Images Before Act: Unlocking the Chain-of-Thought Reasoning in Diffusion Model for Robot Manipulation with Multimodal Prompts","display_name":"Generate Subgoal Images Before Act: Unlocking the Chain-of-Thought Reasoning in Diffusion Model for Robot Manipulation with Multimodal Prompts","publication_year":2024,"publication_date":"2024-06-16","ids":{"openalex":"https://openalex.org/W4402753568","doi":"https://doi.org/10.1109/cvpr52733.2024.01327"},"language":"en","primary_location":{"id":"doi:10.1109/cvpr52733.2024.01327","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.01327","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031772008","display_name":"Fei Ni","orcid":"https://orcid.org/0000-0002-7976-6855"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Fei Ni","raw_affiliation_strings":["Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047509839","display_name":"Jianye Hao","orcid":"https://orcid.org/0000-0002-0422-8235"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianye Hao","raw_affiliation_strings":["Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046786519","display_name":"Shiguang Wu","orcid":"https://orcid.org/0009-0002-8203-2761"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiguang Wu","raw_affiliation_strings":["Huawei Noah&#x0027;s Ark Lab,China"],"affiliations":[{"raw_affiliation_string":"Huawei Noah&#x0027;s Ark Lab,China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113397231","display_name":"Longxin Kou","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longxin Kou","raw_affiliation_strings":["Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052599241","display_name":"Jiashun Liu","orcid":"https://orcid.org/0000-0002-7512-5188"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiashun Liu","raw_affiliation_strings":["Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018361974","display_name":"Yan Zheng","orcid":"https://orcid.org/0000-0003-1788-9179"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Zheng","raw_affiliation_strings":["Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115602018","display_name":"Bin Wang","orcid":"https://orcid.org/0000-0001-7665-6290"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Wang","raw_affiliation_strings":["Huawei Noah&#x0027;s Ark Lab,China"],"affiliations":[{"raw_affiliation_string":"Huawei Noah&#x0027;s Ark Lab,China","institution_ids":["https://openalex.org/I2250955327"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5011036128","display_name":"Yuzheng Zhuang","orcid":"https://orcid.org/0000-0002-0915-0254"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuzheng Zhuang","raw_affiliation_strings":["Huawei Noah&#x0027;s Ark Lab,China"],"affiliations":[{"raw_affiliation_string":"Huawei Noah&#x0027;s Ark Lab,China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5031772008"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":1.1586,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.81840684,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"13991","last_page":"14000"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.9854000210762024,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9642999768257141,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5823490619659424},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5817379951477051},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4974048435688019},{"id":"https://openalex.org/keywords/chain","display_name":"Chain (unit)","score":0.45193326473236084},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.38280975818634033},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3696678876876831},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.35780835151672363},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.2715988755226135}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5823490619659424},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5817379951477051},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4974048435688019},{"id":"https://openalex.org/C199185054","wikidata":"https://www.wikidata.org/wiki/Q552299","display_name":"Chain (unit)","level":2,"score":0.45193326473236084},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.38280975818634033},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3696678876876831},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.35780835151672363},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2715988755226135},{"id":"https://openalex.org/C1276947","wikidata":"https://www.wikidata.org/wiki/Q333","display_name":"Astronomy","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvpr52733.2024.01327","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.01327","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W2908510526","https://openalex.org/W2962800603","https://openalex.org/W2963150697","https://openalex.org/W2963937837","https://openalex.org/W2990138404","https://openalex.org/W3007769740","https://openalex.org/W3036167779","https://openalex.org/W3153469116","https://openalex.org/W3172364764","https://openalex.org/W4207072548","https://openalex.org/W4221143046","https://openalex.org/W4221159977","https://openalex.org/W4224035735","https://openalex.org/W4224912544","https://openalex.org/W4225323055","https://openalex.org/W4226125322","https://openalex.org/W4226278401","https://openalex.org/W4281485151","https://openalex.org/W4285218426","https://openalex.org/W4286892945","https://openalex.org/W4288099666","https://openalex.org/W4301206121","https://openalex.org/W4302010007","https://openalex.org/W4303648971","https://openalex.org/W4308167501","https://openalex.org/W4310625358","https://openalex.org/W4312933868","https://openalex.org/W4323572061","https://openalex.org/W4324319985","https://openalex.org/W4366330503","https://openalex.org/W4378465306","https://openalex.org/W4378473834","https://openalex.org/W4379260839","https://openalex.org/W4384264726","https://openalex.org/W4385245566","https://openalex.org/W4385403811","https://openalex.org/W4385430679","https://openalex.org/W4385473486","https://openalex.org/W4386076215","https://openalex.org/W4387725599","https://openalex.org/W4390872325","https://openalex.org/W6739901393","https://openalex.org/W6765779288","https://openalex.org/W6779823529","https://openalex.org/W6783713337","https://openalex.org/W6785308759","https://openalex.org/W6790830454","https://openalex.org/W6791353385","https://openalex.org/W6795288823","https://openalex.org/W6810334672","https://openalex.org/W6810738896"],"related_works":["https://openalex.org/W8302103","https://openalex.org/W3171631314","https://openalex.org/W2674584172","https://openalex.org/W2967743314","https://openalex.org/W1990892711","https://openalex.org/W2606825221","https://openalex.org/W2184085865","https://openalex.org/W2064245428","https://openalex.org/W2170721890","https://openalex.org/W2122871747"],"abstract_inverted_index":{"Robotics":[0],"agents":[1],"often":[2],"struggle":[3],"to":[4,18,38,59,99,157,193],"understand":[5],"and":[6,21,103,153,166,184,215],"follow":[7],"the":[8,32,61,72,101,114,130,141,160,167,194],"multi-modal":[9,68],"prompts":[10,105],"in":[11,64,78],"complex":[12],"manipulation":[13,30,66,177],"scenes":[14],"which":[15,111,200],"are":[16,217],"challenging":[17],"be":[19],"sufficiently":[20],"accurately":[22],"described":[23],"by":[24,71],"text":[25],"alone.":[26],"Moreover,":[27],"for":[28],"long-horizon":[29,65],"tasks,":[31,81],"deviation":[33],"from":[34,45],"general":[35,102],"instruction":[36,62,170],"tends":[37],"accumulate":[39],"if":[40],"lack":[41],"of":[42,75,132,145,162,169,211],"intermediate":[43],"guidance":[44],"high-level":[46,97],"subgoals.":[47],"For":[48],"this,":[49],"we":[50,53,82,149],"consider":[51],"can":[52,128],"generate":[54],"subgoal":[55,109,164,198,204],"images":[56,165],"before":[57,118],"act":[58],"enhance":[60,159],"following":[63],"with":[67],"prompts?":[69],"Inspired":[70],"great":[73],"success":[74],"diffusion":[76,93,146],"model":[77,94,117],"image":[79,205],"generation":[80,138,152],"propose":[83,150],"a":[84,96,123,136,203,208],"novel":[85],"hierarchical":[86],"framework":[87],"named":[88],"as":[89,95],"CoTDiffusion":[90,187],"that":[91,127,202],"incorporates":[92],"planner":[98],"convert":[100],"multimodal":[104],"into":[106],"coherent":[107,137],"visual":[108,180,182,185],"plans,":[110],"further":[112,158],"guide":[113],"low-level":[115],"policy":[116],"action":[119],"execution.":[120],"We":[121],"design":[122],"semantic":[124],"alignment":[125],"module":[126],"anchor":[129],"progress":[131],"generated":[133,163],"keyframes":[134],"along":[135],"chain,":[139],"unlocking":[140],"chain-of-thought":[142],"reasoning":[143],"ability":[144],"model.":[147],"Additionally,":[148],"bi-directional":[151],"frame":[154],"concat":[155],"mechanism":[156],"fidelity":[161],"accuracy":[168],"following.":[171],"The":[172,213],"experiments":[173],"cover":[174],"various":[175],"robotics":[176],"scenarios":[178],"including":[179],"reasoning,":[181],"rearrange,":[183],"constraints.":[186],"achieves":[188],"outstanding":[189],"performance":[190],"gain":[191],"compared":[192],"baselines":[195],"without":[196],"explicit":[197],"generation,":[199],"proves":[201],"is":[206],"worth":[207],"thousand":[209],"words":[210],"instruction.":[212],"details":[214],"visualizations":[216],"available":[218],"at":[219],"https://cotdiffusion.github.io.":[220]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2025-12-27T23:08:20.325037","created_date":"2025-10-10T00:00:00"}
