{"id":"https://openalex.org/W7164831289","doi":"https://doi.org/10.1145/3805622.3810612","title":"Joint-Guided Spatial and Semantic Sensitive Diffusion Policy for Robotic Manipulation","display_name":"Joint-Guided Spatial and Semantic Sensitive Diffusion Policy for Robotic Manipulation","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164831289","doi":"https://doi.org/10.1145/3805622.3810612"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810612","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810612","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810612","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069498483","display_name":"Hongda Zhang","orcid":"https://orcid.org/0000-0002-1559-0344"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongda Zhang","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-1559-0344","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091016502","display_name":"Siao Liu","orcid":"https://orcid.org/0000-0003-4285-3573"},"institutions":[{"id":"https://openalex.org/I185940356","display_name":"Soochow University","ror":"https://ror.org/05kvm7n82","country_code":"TW","type":"education","lineage":["https://openalex.org/I185940356"]},{"id":"https://openalex.org/I3923682","display_name":"Soochow University","ror":"https://ror.org/05t8y2r12","country_code":"CN","type":"education","lineage":["https://openalex.org/I3923682"]}],"countries":["CN","TW"],"is_corresponding":false,"raw_author_name":"Siao Liu","raw_affiliation_strings":["Soochow University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-4285-3573","affiliations":[{"raw_affiliation_string":"Soochow University, Shanghai, China","institution_ids":["https://openalex.org/I185940356","https://openalex.org/I3923682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100330628","display_name":"Yi Liu","orcid":"https://orcid.org/0000-0003-2221-2998"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Liu","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-2221-2998","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101918104","display_name":"Chun Ouyang","orcid":"https://orcid.org/0000-0002-6249-4005"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chun Ouyang","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-6249-4005","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103267605","display_name":"Zhongxue Gan","orcid":"https://orcid.org/0000-0003-2709-6148"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongxue Gan","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-2709-6148","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.91541695,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1573","last_page":"1577"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.7279999852180481,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.7279999852180481,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.10599999874830246,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.05790000036358833,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6168000102043152},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.5756999850273132},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5169000029563904},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.4555000066757202},{"id":"https://openalex.org/keywords/imitation","display_name":"Imitation","score":0.4496999979019165},{"id":"https://openalex.org/keywords/robotics","display_name":"Robotics","score":0.4083000123500824},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.37459999322891235}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7644000053405762},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6970999836921692},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6168000102043152},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.5756999850273132},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5169000029563904},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.4555000066757202},{"id":"https://openalex.org/C126388530","wikidata":"https://www.wikidata.org/wiki/Q1131737","display_name":"Imitation","level":2,"score":0.4496999979019165},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.4083000123500824},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4025000035762787},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.384799987077713},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.37459999322891235},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.3416000008583069},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26339998841285706},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810612","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810612","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810612","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810612","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W4313156423","https://openalex.org/W4385430674","https://openalex.org/W4386071839","https://openalex.org/W4401415287","https://openalex.org/W4402727359","https://openalex.org/W4403337227","https://openalex.org/W4408352604"],"related_works":[],"abstract_inverted_index":{"Imitation":[0],"learning":[1],"has":[2],"shown":[3],"strong":[4],"potential":[5],"for":[6,51],"enabling":[7],"robots":[8],"to":[9,36,86,115],"acquire":[10],"dexterous":[11],"manipulation":[12,131],"skills":[13],"by":[14,75],"integrating":[15],"visual":[16,26],"observations":[17],"with":[18,81,119],"proprioceptive":[19,84],"states.":[20],"However,":[21,92],"common":[22],"approaches":[23],"typically":[24],"use":[25],"encoders":[27],"pretrained":[28],"in":[29,90],"computer":[30],"vision":[31],"domains,":[32],"which":[33,68],"mainly":[34],"aim":[35],"extract":[37,116],"generic":[38,73],"representations":[39,97,118],"without":[40],"emphasizing":[41],"the":[42,59],"precise":[43],"spatial":[44],"and":[45,62,72,78,83,128],"semantic":[46,79],"structures":[47],"that":[48,111,134],"are":[49],"crucial":[50],"robotic":[52,130],"manipulation.":[53,91],"In":[54],"this":[55],"work,":[56],"we":[57,104],"propose":[58],"Joint-Guided":[60,107],"Spatial":[61],"Semantic":[63],"Sensitive":[64],"Diffusion":[65],"Policy":[66],"(S3D),":[67],"effectively":[69],"fuses":[70],"structured":[71],"features":[74],"incorporating":[76,94],"depth":[77],"maps":[80],"RGB":[82],"inputs":[85],"strengthen":[87],"spatial\u2013semantic":[88],"understanding":[89],"naively":[93],"these":[95],"multimodal":[96],"inevitably":[98],"introduces":[99],"additional":[100],"computational":[101],"overhead.":[102],"Thus,":[103],"introduce":[105],"a":[106,124],"Dynamic":[108],"Attention":[109],"module":[110],"generates":[112],"joint-conditioned":[113],"queries":[114],"behavior-specific":[117],"controlled":[120],"complexity.":[121],"Experiments":[122],"across":[123],"variety":[125],"of":[126],"simulated":[127],"real-world":[129],"tasks":[132],"demonstrate":[133],"S3D":[135],"yields":[136],"consistent":[137],"performance":[138],"gains":[139],"over":[140],"state-of-the-art":[141],"methods.":[142]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
