{"id":"https://openalex.org/W7140187874","doi":"https://doi.org/10.48550/arxiv.2603.22279","title":"3D-Layout-R1: Structured Reasoning for Language-Instructed Spatial Editing","display_name":"3D-Layout-R1: Structured Reasoning for Language-Instructed Spatial Editing","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140187874","doi":"https://doi.org/10.48550/arxiv.2603.22279"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22279","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhen, Haoyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhen, Haoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Xiaolong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiaolong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhao, Yilin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yilin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Sifei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Sifei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Mo, Kaichun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mo, Kaichun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gan, Chuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gan, Chuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Radhakrishnan, Subhashree","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Radhakrishnan, Subhashree","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.935699999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.935699999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.00860000029206276,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.007699999958276749,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8810999989509583},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.6065000295639038},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.5151000022888184},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4691999852657318},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3898000121116638},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.3840999901294708},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.3589000105857849},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.3474000096321106}],"concepts":[{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8810999989509583},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7455999851226807},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.6065000295639038},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6000999808311462},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.5151000022888184},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.48840001225471497},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4691999852657318},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3898000121116638},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3840999901294708},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3589000105857849},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.3441999852657318},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.30160000920295715},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29429998993873596},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2619999945163727},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7008547782897949}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1,6],"Models":[2,7],"(LLMs)":[3],"and":[4,20,48,88,108,121,134],"Vision":[5],"(VLMs)":[8],"have":[9],"shown":[10],"impressive":[11],"reasoning":[12,77],"abilities,":[13],"yet":[14],"they":[15],"struggle":[16],"with":[17],"spatial":[18,37,71,91,106,155],"understanding":[19],"layout":[21,38,101],"consistency":[22],"when":[23],"performing":[24],"fine-grained":[25],"visual":[26],"editing.":[27],"We":[28,93],"introduce":[29],"a":[30,49,98],"Structured":[31],"Reasoning":[32],"framework":[33],"that":[34,64],"performs":[35],"text-conditioned":[36],"editing":[39,102],"via":[40],"scene-graph":[41],"reasoning.":[42],"Given":[43],"an":[44,60,115],"input":[45],"scene":[46,62],"graph":[47,57,63],"natural-language":[50],"instruction,":[51],"the":[52,56,66,76],"model":[53],"reasons":[54],"over":[55,90],"to":[58,128,139,148],"generate":[59],"updated":[61],"satisfies":[65],"text":[67],"condition":[68],"while":[69],"maintaining":[70],"coherence.":[72],"By":[73],"explicitly":[74],"guiding":[75],"process":[78],"through":[79],"structured":[80],"relational":[81],"representations,":[82],"our":[83,95,143],"approach":[84],"improves":[85],"both":[86],"interpretability":[87],"control":[89],"relationships.":[92],"evaluate":[94],"method":[96],"on":[97],"new":[99],"text-guided":[100],"benchmark":[103],"encompassing":[104],"sorting,":[105],"alignment,":[107],"room-editing":[109],"tasks.":[110],"Our":[111],"training":[112],"paradigm":[113],"yields":[114],"average":[116],"15%":[117],"improvement":[118],"in":[119,124],"IoU":[120],"25%":[122],"reduction":[123],"center-distance":[125],"error":[126],"compared":[127],"Chain":[129],"of":[130],"Thought":[131],"Fine-tuning":[132],"(CoT-SFT)":[133],"vanilla":[135],"GRPO":[136],"baselines.":[137],"Compared":[138],"SOTA":[140],"zero-shot":[141],"LLMs,":[142],"best":[144],"models":[145],"achieve":[146],"up":[147],"20%":[149],"higher":[150],"mIoU,":[151],"demonstrating":[152],"markedly":[153],"improved":[154],"precision.":[156]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
