{"id":"https://openalex.org/W7162565852","doi":"https://doi.org/10.1109/3dv69130.2026.00161","title":"GMT: Goal-Conditioned Multimodal Transformer for 6-DOF Object Trajectory Synthesis in 3D Scenes","display_name":"GMT: Goal-Conditioned Multimodal Transformer for 6-DOF Object Trajectory Synthesis in 3D Scenes","publication_year":2026,"publication_date":"2026-03-20","ids":{"openalex":"https://openalex.org/W7162565852","doi":"https://doi.org/10.1109/3dv69130.2026.00161"},"language":null,"primary_location":{"id":"doi:10.1109/3dv69130.2026.00161","is_oa":false,"landing_page_url":"https://doi.org/10.1109/3dv69130.2026.00161","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 International Conference on 3D Vision (3DV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065267141","display_name":"Huajian Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huajian Zeng","raw_affiliation_strings":["TU M&#x00FC;nchen"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"TU M&#x00FC;nchen","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137126021","display_name":"Abhishek Saroha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abhishek Saroha","raw_affiliation_strings":["TU M&#x00FC;nchen"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"TU M&#x00FC;nchen","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137171567","display_name":"Daniel Cremers","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Cremers","raw_affiliation_strings":["TU M&#x00FC;nchen"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"TU M&#x00FC;nchen","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5137090723","display_name":"Xi Wang","orcid":"https://orcid.org/0000-0001-5173-2234"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xi Wang","raw_affiliation_strings":["TU M&#x00FC;nchen"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"TU M&#x00FC;nchen","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.83322219,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1691","last_page":"1701"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.19900000095367432,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.19900000095367432,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.13429999351501465,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.057100001722574234,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4426000118255615},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.43459999561309814},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.3564999997615814},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.2752000093460083},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.22280000150203705}],"concepts":[{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6888999938964844},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.623199999332428},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6212000250816345},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4426000118255615},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.43459999561309814},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3564999997615814},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.24240000545978546},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.22280000150203705},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.21660000085830688}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/3dv69130.2026.00161","is_oa":false,"landing_page_url":"https://doi.org/10.1109/3dv69130.2026.00161","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 International Conference on 3D Vision (3DV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W2019965290","https://openalex.org/W2062556295","https://openalex.org/W2099893201","https://openalex.org/W2141664020","https://openalex.org/W2211722331","https://openalex.org/W2443711627","https://openalex.org/W2949924544","https://openalex.org/W2950069298","https://openalex.org/W2963184176","https://openalex.org/W2963439114","https://openalex.org/W2963727135","https://openalex.org/W3174902251","https://openalex.org/W4243385754","https://openalex.org/W4285102193","https://openalex.org/W4312517483","https://openalex.org/W4312635677","https://openalex.org/W4312637963","https://openalex.org/W4385430679","https://openalex.org/W4386076253","https://openalex.org/W4386076288","https://openalex.org/W4390872031","https://openalex.org/W4390872581","https://openalex.org/W4390873101","https://openalex.org/W4390874416","https://openalex.org/W4391305822","https://openalex.org/W4402702968","https://openalex.org/W4402754111","https://openalex.org/W4402772414","https://openalex.org/W4404439868","https://openalex.org/W4413144259","https://openalex.org/W4413145303","https://openalex.org/W4413146087","https://openalex.org/W4413147595","https://openalex.org/W4414197407","https://openalex.org/W4415799055","https://openalex.org/W7133185896","https://openalex.org/W7133199008","https://openalex.org/W7133218093"],"related_works":[],"abstract_inverted_index":{"Synthesizing":[0],"controllable":[1],"6-DOF":[2,95],"object":[3,70,83],"manipulation":[4,152],"trajectories":[5,71,92],"in":[6,138],"3D":[7,44,75,163],"environments":[8],"is":[9],"essential":[10],"for":[11,26,150],"enabling":[12],"robots":[13],"to":[14,23,49,158],"interact":[15],"with":[16],"complex":[17],"scenes,":[18],"yet":[19],"remains":[20],"challenging":[21],"due":[22],"the":[24],"need":[25],"accurate":[27],"spatial":[28,139],"reasoning,":[29],"physical":[30],"feasibility,":[31],"and":[32,54,68,85,98,109,116,126,133,141,154,161],"multimodal":[33,62],"scene":[34,52],"understanding.":[35],"Existing":[36],"approaches":[37],"often":[38],"rely":[39],"on":[40,114],"2D":[41],"or":[42],"partial":[43],"representations,":[45],"limiting":[46],"their":[47],"ability":[48],"capture":[50],"full":[51],"geometry":[53],"constraining":[55],"trajectory":[56],"precision.":[57],"We":[58],"present":[59],"GMT,":[60],"a":[61,100,147],"transformer":[63],"framework":[64],"that":[65,104,120],"generates":[66],"realistic":[67],"goal-directed":[69],"by":[72],"jointly":[73],"leveraging":[74],"bounding":[76],"box":[77],"geometry,":[78],"point":[79],"cloud":[80],"context,":[81],"semantic":[82],"categories,":[84],"target":[86],"end":[87],"poses.":[88],"The":[89],"model":[90],"represents":[91],"as":[93,131],"continuous":[94],"pose":[96],"sequences":[97],"employs":[99],"tailored":[101],"conditioning":[102],"strategy":[103],"fuses":[105],"geometric,":[106],"semantic,":[107],"contextual,":[108],"goal-oriented":[110],"information.":[111],"Extensive":[112],"experiments":[113],"synthetic":[115],"real-world":[117],"benchmarks":[118],"demonstrate":[119],"GMT":[121],"outperforms":[122],"state-of-the-art":[123],"human":[124],"motion":[125],"human-object":[127],"interaction":[128],"baselines,":[129],"such":[130],"CHOIS":[132],"GIMO,":[134],"achieving":[135],"substantial":[136],"gains":[137],"accuracy":[140],"orientation":[142],"control.":[143],"Our":[144],"method":[145],"establishes":[146],"new":[148],"benchmark":[149],"learning-based":[151],"planning":[153],"shows":[155],"strong":[156],"generalization":[157],"diverse":[159],"objects":[160],"cluttered":[162],"environments.":[164]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
