{"id":"https://openalex.org/W4415538226","doi":"https://doi.org/10.1145/3746027.3754850","title":"Human Motion Generation in 3D Scenes from Open-Ended Textual Instructions with MLLM Planning","display_name":"Human Motion Generation in 3D Scenes from Open-Ended Textual Instructions with MLLM Planning","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415538226","doi":"https://doi.org/10.1145/3746027.3754850"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754850","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754850","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013270639","display_name":"Siyi Qian","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Siyi Qian","raw_affiliation_strings":["Peking University, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0002-2793-0067","affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106660368","display_name":"Jian Fang","orcid":null},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Fang","raw_affiliation_strings":["Harbin Institute of Technology, Harbin, China"],"raw_orcid":"https://orcid.org/0009-0002-7018-6958","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Harbin, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuzhou Mao","orcid":"https://orcid.org/0009-0005-4374-9016"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuzhou Mao","raw_affiliation_strings":["Harbin Institute of Technology, Harbin, China"],"raw_orcid":"https://orcid.org/0009-0005-4374-9016","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Harbin, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yayun Zou","orcid":"https://orcid.org/0009-0007-0517-3188"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yayun Zou","raw_affiliation_strings":["Harbin Institute of Technology, Harbin, China"],"raw_orcid":"https://orcid.org/0009-0007-0517-3188","affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Harbin, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008772211","display_name":"Wentao Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wentao Zhang","raw_affiliation_strings":["Peking University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7532-5550","affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102788950","display_name":"Haiwei Xue","orcid":"https://orcid.org/0000-0001-7318-9682"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haiwei Xue","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-7318-9682","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5013270639"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2874134,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"9404","last_page":"9413"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.7684999704360962},{"id":"https://openalex.org/keywords/generator","display_name":"Generator (circuit theory)","score":0.7125999927520752},{"id":"https://openalex.org/keywords/natural-language-generation","display_name":"Natural language generation","score":0.6863999962806702},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.5877000093460083},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.5863999724388123},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5022000074386597},{"id":"https://openalex.org/keywords/planner","display_name":"Planner","score":0.48890000581741333},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4715000092983246},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.4278999865055084}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8023999929428101},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.7684999704360962},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.7125999927520752},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.6863999962806702},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6122000217437744},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.5877000093460083},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.5863999724388123},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.503000020980835},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5022000074386597},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.48890000581741333},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4715000092983246},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.4278999865055084},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.3935000002384186},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.37599998712539673},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3490999937057495},{"id":"https://openalex.org/C60048249","wikidata":"https://www.wikidata.org/wiki/Q37437","display_name":"Syntax","level":2,"score":0.3467000126838684},{"id":"https://openalex.org/C2985684807","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Text generation","level":2,"score":0.34119999408721924},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.33959999680519104},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3361999988555908},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.33410000801086426},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C2986578859","wikidata":"https://www.wikidata.org/wiki/Q657632","display_name":"Human motion","level":3,"score":0.32710000872612},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.2842000126838684},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.27649998664855957},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.27059999108314514},{"id":"https://openalex.org/C59415355","wikidata":"https://www.wikidata.org/wiki/Q3484781","display_name":"Text simplification","level":3,"score":0.2547999918460846},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754850","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754850","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W2594519801","https://openalex.org/W2949924544","https://openalex.org/W2971856312","https://openalex.org/W2978956737","https://openalex.org/W2982275673","https://openalex.org/W3035724791","https://openalex.org/W3095974555","https://openalex.org/W3108262631","https://openalex.org/W3153832461","https://openalex.org/W3175246890","https://openalex.org/W4214677627","https://openalex.org/W4288079574","https://openalex.org/W4312612133","https://openalex.org/W4312635677","https://openalex.org/W4386065807","https://openalex.org/W4386075980","https://openalex.org/W4386075984","https://openalex.org/W4386076288","https://openalex.org/W4393153655"],"related_works":[],"abstract_inverted_index":{"Generating":[0],"human":[1],"motion":[2,86,130,138,154],"in":[3,57,81,187],"scenes":[4],"from":[5],"text":[6,31,67],"aims":[7],"to":[8,29,46,62,71,123],"synthesize":[9],"semantically":[10],"aligned":[11,132],"and":[12,25,44,64,108,127,147,161,181],"scene-aware":[13,137],"motions.":[14],"Existing":[15],"methods":[16],"have":[17],"made":[18],"significant":[19],"progress":[20],"by":[21],"incorporating":[22],"spatial":[23],"reasoning":[24],"structured":[26],"generation":[27,171],"strategies":[28],"connect":[30],"descriptions":[32],"with":[33,77,133,144],"human-scene":[34],"interactions.":[35],"However,":[36],"they":[37],"typically":[38],"rely":[39],"on":[40],"simple":[41],"textual":[42],"inputs":[43],"struggle":[45],"comprehend":[47],"open-ended":[48,125,185],"instructions.":[49],"There":[50],"are":[51],"three":[52],"key":[53],"challenges:":[54],"(1)":[55],"difficulty":[56],"understanding":[58],"complex":[59],"instructions":[60,186],"due":[61],"limited":[63],"templated":[65],"training":[66],"annotations;":[68],"(2)":[69],"inability":[70],"generate":[72],"natural":[73,182],"motions":[74,183],"that":[75,88,176],"align":[76],"arbitrary":[78,134],"trajectories":[79],"described":[80],"text;":[82],"(3)":[83],"lack":[84],"of":[85,102],"diversity":[87],"matches":[89],"the":[90,105,109,141,170],"intended":[91],"semantics.":[92],"To":[93,152],"address":[94],"these":[95],"challenges,":[96],"we":[97,156],"propose":[98],"PSMo,":[99],"which":[100,165],"consists":[101],"two":[103],"components:":[104],"Semantic":[106,114],"Planner":[107,115],"Scene-Aware":[110,162],"Motion":[111],"Generator.":[112],"The":[113,136],"leverages":[116],"a":[117,148,158],"Multimodal":[118],"Large":[119],"Language":[120],"Model":[121],"(MLLM)":[122],"parse":[124],"instructions,":[126],"plans":[128],"fine-grained":[129],"states":[131],"trajectories.":[135],"generator":[139],"adopts":[140],"diffusion":[142],"model":[143],"trajectory":[145],"constraints":[146],"sequential":[149],"tiling":[150],"strategy.":[151],"enhance":[153],"diversity,":[155],"introduce":[157],"retrieval-augmented":[159],"strategy":[160],"Retrieval":[163],"Attention,":[164],"integrates":[166],"multi-modal":[167],"features":[168],"into":[169],"process.":[172],"Extensive":[173],"experiments":[174],"demonstrate":[175],"our":[177],"method":[178],"produces":[179],"high-quality":[180],"under":[184],"scenes.":[188]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-25T00:00:00"}
