{"id":"https://openalex.org/W7164835370","doi":"https://doi.org/10.1145/3805622.3810797","title":"OOWM: Structuring Embodied Reasoning and Planning via Object-Oriented Programmatic World Modeling","display_name":"OOWM: Structuring Embodied Reasoning and Planning via Object-Oriented Programmatic World Modeling","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164835370","doi":"https://doi.org/10.1145/3805622.3810797"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810797","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810797","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810797","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5138669987","display_name":"Hongyu Chen","orcid":"https://orcid.org/0009-0000-6762-4617"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongyu Chen","raw_affiliation_strings":["Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-6762-4617","affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100412937","display_name":"Liang Lin","orcid":"https://orcid.org/0000-0003-2248-3755"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Lin","raw_affiliation_strings":["Sun Yat-sen University, Guangzhou, China; X-Era AI Lab, Guangzhou, China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-2248-3755","affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangzhou, China; X-Era AI Lab, Guangzhou, China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052611320","display_name":"Guangrun Wang","orcid":"https://orcid.org/0000-0001-7760-1339"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangrun Wang","raw_affiliation_strings":["Sun Yat-sen University, Guangzhou, China; X-Era AI Lab, Guangzhou, China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-7760-1339","affiliations":[{"raw_affiliation_string":"Sun Yat-sen University, Guangzhou, China; X-Era AI Lab, Guangzhou, China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93716307,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"558","last_page":"566"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4528999924659729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4528999924659729,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.13199999928474426,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.12070000171661377,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.5421000123023987},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.527999997138977},{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.5094000101089478},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.424699991941452},{"id":"https://openalex.org/keywords/cognitive-robotics","display_name":"Cognitive robotics","score":0.4074999988079071},{"id":"https://openalex.org/keywords/operationalization","display_name":"Operationalization","score":0.3968000113964081},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.3919000029563904},{"id":"https://openalex.org/keywords/planner","display_name":"Planner","score":0.3831999897956848},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.37529999017715454},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.36959999799728394}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7555999755859375},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5787000060081482},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.5421000123023987},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.527999997138977},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.5094000101089478},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.424699991941452},{"id":"https://openalex.org/C192327766","wikidata":"https://www.wikidata.org/wiki/Q1038799","display_name":"Cognitive robotics","level":3,"score":0.4074999988079071},{"id":"https://openalex.org/C9354725","wikidata":"https://www.wikidata.org/wiki/Q286017","display_name":"Operationalization","level":2,"score":0.3968000113964081},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.3831999897956848},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.37529999017715454},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.36959999799728394},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3488999903202057},{"id":"https://openalex.org/C179603123","wikidata":"https://www.wikidata.org/wiki/Q1941921","display_name":"Modeling language","level":3,"score":0.34279999136924744},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3239000141620636},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C2780330621","wikidata":"https://www.wikidata.org/wiki/Q7936609","display_name":"Visual modeling","level":4,"score":0.31690001487731934},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.313400000333786},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28940001130104065},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2831999957561493},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C103683099","wikidata":"https://www.wikidata.org/wiki/Q5370102","display_name":"Embodied agent","level":3,"score":0.2809000015258789},{"id":"https://openalex.org/C2775945657","wikidata":"https://www.wikidata.org/wiki/Q381442","display_name":"Structuring","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2741999924182892},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.27379998564720154},{"id":"https://openalex.org/C202446494","wikidata":"https://www.wikidata.org/wiki/Q664166","display_name":"Class diagram","level":4,"score":0.2624000012874603},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C167822520","wikidata":"https://www.wikidata.org/wiki/Q176452","display_name":"Finite-state machine","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2547000050544739}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810797","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810797","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810797","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810797","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2013909059","https://openalex.org/W2152161678","https://openalex.org/W4385430679","https://openalex.org/W4385570088","https://openalex.org/W4389519818","https://openalex.org/W4393160302","https://openalex.org/W4401042867","https://openalex.org/W4402354102","https://openalex.org/W4402671211","https://openalex.org/W4414079054","https://openalex.org/W7133212131","https://openalex.org/W7138128974"],"related_works":[],"abstract_inverted_index":{"Standard":[0],"Chain-of-Thought":[1],"(CoT)":[2],"prompting":[3],"empowers":[4],"Large":[5],"Language":[6,134],"Models":[7],"(LLMs)":[8],"with":[9,112,173,201],"reasoning":[10,66,195],"capabilities,":[11],"yet":[12],"its":[13],"reliance":[14],"on":[15,206],"linear":[16],"natural":[17],"language":[18],"is":[19],"inherently":[20],"insufficient":[21],"for":[22,46,230],"effective":[23,198],"world":[24,77],"modeling":[25],"in":[26,218],"embodied":[27,65,232],"tasks.":[28],"While":[29],"text":[30],"offers":[31],"flexibility,":[32],"it":[33,140],"fails":[34],"to":[35,136,144,155,189],"explicitly":[36],"represent":[37],"the":[38,68,76,106,118,131,186,192,207],"state-space,":[39],"object":[40,150],"hierarchies,":[41,151],"and":[42,152,223],"causal":[43],"dependencies":[44],"required":[45],"robust":[47],"robotic":[48],"planning.":[49],"To":[50],"address":[51],"these":[52],"limitations,":[53],"we":[54,163],"propose":[55],"Object-Oriented":[56],"World":[57],"Modeling":[58,133],"(OOWM),":[59],"a":[60,81,101,113,165,227],"novel":[61],"framework":[62],"that":[63,211],"structures":[64],"through":[67],"lens":[69],"of":[70],"software":[71],"engineering":[72],"formalisms.":[73],"We":[74],"redefine":[75],"model":[78],"not":[79],"as":[80,86],"latent":[82],"vector":[83],"space,":[84],"but":[85],"an":[87],"explicit":[88],"symbolic":[89],"tuple":[90],"\\(\\mathcal":[91,109,121],"{W}":[92],"=":[93],"\\langle":[94],"\\mathcal":[95,97],"{S},":[96],"{T}":[98],"\\rangle\\)":[99],":":[100],"State":[102],"Abstraction":[103],"(\\(G_\\text{state}\\))":[104],"instantiating":[105],"environmental":[107],"state":[108],"{S}\\),":[110],"coupled":[111],"Control":[114],"Policy":[115,176],"(\\(G_\\text{control}\\))":[116],"representing":[117],"transition":[119],"logic":[120],"{T}:":[122],"S":[123],"\\times":[124],"A":[125],"\\rightarrow":[126],"S^{\\prime":[127],"}\\).":[128],"OOWM":[129,212],"leverages":[130],"Unified":[132],"(UML)":[135],"materialize":[137],"this":[138,180],"definition:":[139],"employs":[141],"Class":[142],"Diagrams":[143,154],"ground":[145],"visual":[146],"perception":[147],"into":[148,158],"rigorous":[149],"Activity":[153],"operationalize":[156],"planning":[157,219],"executable":[159],"control":[160],"flows.":[161],"Furthermore,":[162],"introduce":[164],"three-stage":[166],"training":[167],"pipeline":[168],"combining":[169],"Supervised":[170],"Fine-Tuning":[171],"(SFT)":[172],"Group":[174],"Relative":[175],"Optimization":[177],"(GRPO).":[178],"Crucially,":[179],"method":[181],"utilizes":[182],"outcome-based":[183],"rewards":[184],"from":[185],"final":[187],"plan":[188],"implicitly":[190],"optimize":[191],"underlying":[193],"object-oriented":[194],"structure,":[196],"enabling":[197],"learning":[199],"even":[200],"sparse":[202],"annotations.":[203],"Extensive":[204],"evaluations":[205],"MRoom-30k":[208],"benchmark":[209],"demonstrate":[210],"significantly":[213],"outperforms":[214],"unstructured":[215],"textual":[216],"baselines":[217],"coherence,":[220],"execution":[221],"success,":[222],"structural":[224],"fidelity,":[225],"establishing":[226],"new":[228],"paradigm":[229],"structured":[231],"reasoning.":[233]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
