{"id":"https://openalex.org/W7133513720","doi":"https://doi.org/10.48550/arxiv.2603.02511","title":"Learning Object-Centric Spatial Reasoning for Sequential Manipulation in Cluttered Environments","display_name":"Learning Object-Centric Spatial Reasoning for Sequential Manipulation in Cluttered Environments","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133513720","doi":"https://doi.org/10.48550/arxiv.2603.02511"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.02511","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02511","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.02511","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018559398","display_name":"Chrisantus Eze","orcid":"https://orcid.org/0000-0001-5440-4316"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Eze, Chrisantus","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128097137","display_name":"Ryan C Julian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Julian, Ryan C","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5014357711","display_name":"Christopher Crick","orcid":"https://orcid.org/0000-0002-1635-823X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Crick, Christopher","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5018559398"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.880299985408783,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.880299985408783,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.05900000035762787,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.011099999770522118,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6344000101089478},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6191999912261963},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.5843999981880188},{"id":"https://openalex.org/keywords/workspace","display_name":"Workspace","score":0.5830000042915344},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.5360000133514404},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5281000137329102},{"id":"https://openalex.org/keywords/modularity","display_name":"Modularity (biology)","score":0.521399974822998},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.49470001459121704},{"id":"https://openalex.org/keywords/obstacle","display_name":"Obstacle","score":0.48590001463890076},{"id":"https://openalex.org/keywords/obstacle-avoidance","display_name":"Obstacle avoidance","score":0.33799999952316284}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7979999780654907},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6675000190734863},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6344000101089478},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6191999912261963},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.5843999981880188},{"id":"https://openalex.org/C58581272","wikidata":"https://www.wikidata.org/wiki/Q12741163","display_name":"Workspace","level":3,"score":0.5830000042915344},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.5360000133514404},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5281000137329102},{"id":"https://openalex.org/C2779478453","wikidata":"https://www.wikidata.org/wiki/Q6889748","display_name":"Modularity (biology)","level":2,"score":0.521399974822998},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.49470001459121704},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.48590001463890076},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37130001187324524},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.362199991941452},{"id":"https://openalex.org/C6683253","wikidata":"https://www.wikidata.org/wiki/Q7075535","display_name":"Obstacle avoidance","level":4,"score":0.33799999952316284},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.31360000371932983},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.3077999949455261},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.2930000126361847},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C68537008","wikidata":"https://www.wikidata.org/wiki/Q247932","display_name":"Stereopsis","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.26460000872612},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.25999999046325684},{"id":"https://openalex.org/C189645446","wikidata":"https://www.wikidata.org/wiki/Q350865","display_name":"Mirroring","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C195818886","wikidata":"https://www.wikidata.org/wiki/Q5421724","display_name":"Expressive power","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2531000077724457},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.2526000142097473},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.25110000371932983},{"id":"https://openalex.org/C2779038628","wikidata":"https://www.wikidata.org/wiki/Q7248497","display_name":"Programming by demonstration","level":3,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.02511","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02511","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.02511","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02511","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5605295896530151,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Robotic":[0],"manipulation":[1,193],"in":[2,30,104,124,134,160,170,175,179,191],"cluttered":[3],"environments":[4],"presents":[5],"a":[6,39,48,63,86,182,214],"critical":[7,75],"challenge":[8],"for":[9,27,38,77,90,184],"automation.":[10],"Recent":[11],"large-scale,":[12],"end-to-end":[13,118],"models":[14],"demonstrate":[15,93,197],"impressive":[16],"capabilities":[17],"but":[18,112],"often":[19],"lack":[20],"the":[21,73,150,158,185,199,210],"data":[22],"efficiency":[23],"and":[24,45,109,120,173,208],"modularity":[25],"required":[26],"retrieving":[28,125],"objects":[29],"dense":[31,128,161],"clutter.":[32,129,162],"In":[33],"this":[34,95],"work,":[35],"we":[36,196],"argue":[37],"paradigm":[40],"of":[41,106,187],"specialized,":[42,188],"decoupled":[43,96],"systems":[44],"present":[46],"Unveiler,":[47],"framework":[49],"that":[50,70,94,156,198],"explicitly":[51],"separates":[52],"high-level":[53],"spatial":[54,201],"reasoning":[55,190,202],"from":[56,127,139],"low-level":[57],"action":[58],"execution.":[59,91],"Unveiler's":[60],"core":[61],"is":[62,82,98,132],"lightweight,":[64],"transformer-based":[65],"Spatial":[66],"Relationship":[67],"Encoder":[68],"(SRE)":[69],"sequentially":[71],"identifies":[72],"most":[74],"obstacle":[76],"removal.":[78],"This":[79],"discrete":[80],"decision":[81],"then":[83],"passed":[84],"to":[85,152,167,205],"rotation-invariant":[87],"Action":[88],"Decoder":[89],"We":[92],"architecture":[97],"not":[99],"only":[100,218],"more":[101],"computationally":[102],"efficient":[103],"terms":[105],"parameter":[107],"count":[108],"inference":[110],"time,":[111],"also":[113],"significantly":[114],"outperforms":[115],"both":[116],"classic":[117],"policies":[119],"modern,":[121],"large-model-based":[122],"baselines":[123],"targets":[126],"The":[130],"SRE":[131],"trained":[133],"two":[135],"stages:":[136],"imitation":[137],"learning":[138],"heuristic":[140,159],"demonstrations":[141],"provides":[142],"sample-efficient":[143],"initialization,":[144],"after":[145],"which":[146],"PPO":[147],"fine-tuning":[148],"enables":[149],"policy":[151],"discover":[153],"removal":[154],"strategies":[155],"surpass":[157],"Our":[163],"results,":[164],"achieving":[165],"up":[166],"97.6\\%":[168],"success":[169],"partially":[171],"occluded":[172,177],"90.0\\%":[174],"fully":[176],"scenarios":[178],"simulation,":[180],"make":[181],"case":[183],"power":[186],"object-centric":[189],"complex":[192],"tasks.":[194],"Additionally,":[195],"SRE's":[200],"transfers":[203],"zero-shot":[204],"real":[206],"scenes,":[207],"validate":[209],"full":[211],"system":[212],"on":[213],"physical":[215],"robot":[216],"requiring":[217],"geometric":[219],"workspace":[220],"calibration;":[221],"no":[222],"learned":[223],"components":[224],"are":[225],"retrained.":[226]},"counts_by_year":[],"updated_date":"2026-03-05T07:36:02.291473","created_date":"2026-03-05T00:00:00"}
