{"id":"https://openalex.org/W7138840518","doi":"https://doi.org/10.48550/arxiv.2603.17583","title":"Edit-As-Act: Goal-Regressive Planning for Open-Vocabulary 3D Indoor Scene Editing","display_name":"Edit-As-Act: Goal-Regressive Planning for Open-Vocabulary 3D Indoor Scene Editing","publication_year":2026,"publication_date":"2026-03-18","ids":{"openalex":"https://openalex.org/W7138840518","doi":"https://doi.org/10.48550/arxiv.2603.17583"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.17583","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.17583","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.17583","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129867047","display_name":"Seongrae Noh","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Noh, Seongrae","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125200175","display_name":"SeungWon Seo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seo, SeungWon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130075996","display_name":"Gyeong-Moon Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Gyeong-Moon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130020883","display_name":"HyeongYeop Kang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, HyeongYeop","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5129867047"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6570000052452087,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6570000052452087,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.052400000393390656,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.03959999978542328,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5273000001907349},{"id":"https://openalex.org/keywords/planner","display_name":"Planner","score":0.48570001125335693},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4717000126838684},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.4650000035762787},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.45350000262260437},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.42309999465942383},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.37220001220703125},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.3666999936103821},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.36149999499320984}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8458999991416931},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5273000001907349},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.48570001125335693},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4717000126838684},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4684000015258789},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.4650000035762787},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.45399999618530273},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.45350000262260437},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.42309999465942383},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.37220001220703125},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3666999936103821},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.36149999499320984},{"id":"https://openalex.org/C45357846","wikidata":"https://www.wikidata.org/wiki/Q2001982","display_name":"Notation","level":2,"score":0.35440000891685486},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.3513999879360199},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3345000147819519},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3325999975204468},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2955000102519989},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2939000129699707},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.29350000619888306},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.28769999742507935},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.2791999876499176},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2754000127315521},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.17583","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.17583","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.17583","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.17583","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Editing":[0],"a":[1,22,50,55,62,88,102,117,143],"3D":[2,99],"indoor":[3,191],"scene":[4,23,93,104,203],"from":[5,45,160],"natural":[6],"language":[7,120],"is":[8],"conceptually":[9],"straightforward":[10],"but":[11],"technically":[12],"challenging.":[13],"Existing":[14],"open-vocabulary":[15,92],"systems":[16],"often":[17],"regenerate":[18],"large":[19],"portions":[20],"of":[21,73,185],"or":[24,38],"rely":[25],"on":[26],"image-space":[27],"edits":[28],"that":[29,75,90,121,175],"disrupt":[30],"spatial":[31],"structure,":[32],"resulting":[33],"in":[34,98,115],"unintended":[35],"global":[36],"changes":[37],"physically":[39,154],"inconsistent":[40],"layouts.":[41],"These":[42],"limitations":[43],"stem":[44],"treating":[46],"editing":[47,67,94,187],"primarily":[48],"as":[49,95],"generative":[51],"task.":[52],"We":[53],"take":[54],"different":[56],"view.":[57],"A":[58,137],"user":[59],"instruction":[60,165],"defines":[61],"desired":[63],"world":[64],"state,":[65],"and":[66,105,113,127,133,142,148,153,169,202],"should":[68],"be":[69],"the":[70],"minimal":[71],"sequence":[72],"actions":[74],"makes":[76],"this":[77],"state":[78],"true":[79],"while":[80],"preserving":[81],"everything":[82],"else.":[83],"This":[84],"perspective":[85],"motivates":[86],"Edit-As-Act,":[87],"framework":[89],"performs":[91],"goal-regressive":[96],"planning":[97],"space.":[100],"Given":[101],"source":[103],"free-form":[106],"instruction,":[107],"Edit-As-Act":[108,163,193],"predicts":[109],"symbolic":[110],"goal":[111],"predicates":[112],"plans":[114],"EditLang,":[116],"PDDL-inspired":[118],"action":[119],"we":[122],"design":[123],"with":[124],"explicit":[125],"preconditions":[126],"effects":[128],"encoding":[129],"support,":[130],"contact,":[131],"collision,":[132],"other":[134],"geometric":[135],"relations.":[136],"language-driven":[138],"planner":[139],"proposes":[140],"actions,":[141],"validator":[144],"enforces":[145],"goal-directedness,":[146],"monotonicity,":[147],"physical":[149,170],"feasibility,":[150],"producing":[151],"interpretable":[152],"coherent":[155],"transformations.":[156],"By":[157],"separating":[158],"reasoning":[159],"low-level":[161],"generation,":[162],"achieves":[164],"fidelity,":[166],"semantic":[167],"consistency,":[168],"plausibility":[171],"-":[172],"three":[173],"criteria":[174],"existing":[176],"paradigms":[177],"cannot":[178],"satisfy":[179],"together.":[180],"On":[181],"E2A-Bench,":[182],"our":[183],"benchmark":[184],"63":[186],"tasks":[188],"across":[189,198],"9":[190],"environments,":[192],"significantly":[194],"outperforms":[195],"prior":[196],"approaches":[197],"all":[199],"edit":[200],"types":[201],"categories.":[204]},"counts_by_year":[],"updated_date":"2026-03-20T20:54:20.808490","created_date":"2026-03-20T00:00:00"}
