{"id":"https://openalex.org/W7134253923","doi":"https://doi.org/10.48550/arxiv.2603.05757","title":"EmboAlign: Aligning Video Generation with Compositional Constraints for Zero-Shot Manipulation","display_name":"EmboAlign: Aligning Video Generation with Compositional Constraints for Zero-Shot Manipulation","publication_year":2026,"publication_date":"2026-03-05","ids":{"openalex":"https://openalex.org/W7134253923","doi":"https://doi.org/10.48550/arxiv.2603.05757"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.05757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.05757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.05757","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128470155","display_name":"Gehao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Gehao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128278628","display_name":"Zhenyang Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ni, Zhenyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113411377","display_name":"Payal Mohapatra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mohapatra, Payal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128587689","display_name":"Han Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128481250","display_name":"Ruohan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ruohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128547296","display_name":"Qi Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Qi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5128470155"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.7907999753952026,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.7907999753952026,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.03790000081062317,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.03449999913573265,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/retargeting","display_name":"Retargeting","score":0.6980999708175659},{"id":"https://openalex.org/keywords/trajectory","display_name":"Trajectory","score":0.6779000163078308},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6460999846458435},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.515999972820282},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.5105999708175659},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5062000155448914},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.4812999963760376},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.47360000014305115},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.451200008392334},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.4212999939918518}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7594000101089478},{"id":"https://openalex.org/C2780575108","wikidata":"https://www.wikidata.org/wiki/Q7316652","display_name":"Retargeting","level":2,"score":0.6980999708175659},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.6779000163078308},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6460999846458435},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6086000204086304},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.515999972820282},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.511900007724762},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.5105999708175659},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5062000155448914},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.4812999963760376},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.47360000014305115},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.451200008392334},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.4212999939918518},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.40689998865127563},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.38830000162124634},{"id":"https://openalex.org/C2775960376","wikidata":"https://www.wikidata.org/wiki/Q1435859","display_name":"Grippers","level":2,"score":0.3853999972343445},{"id":"https://openalex.org/C150415221","wikidata":"https://www.wikidata.org/wiki/Q40687","display_name":"Robotic arm","level":2,"score":0.3779999911785126},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3741999864578247},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.36230000853538513},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3540000021457672},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3490000069141388},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.31949999928474426},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.31619998812675476},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.3066999912261963},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.3046000003814697},{"id":"https://openalex.org/C2779456664","wikidata":"https://www.wikidata.org/wiki/Q972162","display_name":"Specularity","level":3,"score":0.29750001430511475},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.2915000021457672},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C39920418","wikidata":"https://www.wikidata.org/wiki/Q11476","display_name":"Kinematics","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C196467688","wikidata":"https://www.wikidata.org/wiki/Q1851985","display_name":"Telerobotics","level":4,"score":0.2590000033378601},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.05757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.05757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.05757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.05757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"generative":[1],"models":[2,77],"(VGMs)":[3],"pretrained":[4],"on":[5,187],"large-scale":[6],"internet":[7],"data":[8],"can":[9,98],"produce":[10,31],"temporally":[11],"coherent":[12],"rollout":[13,140,167],"videos":[14],"that":[15,67,86,97],"capture":[16],"rich":[17],"object":[18],"dynamics,":[19],"offering":[20],"a":[21,64,89,113,118,123,146],"compelling":[22],"foundation":[23],"for":[24],"zero-shot":[25],"robotic":[26],"manipulation.":[27],"However,":[28],"VGMs":[29],"often":[30],"physically":[32,155],"implausible":[33],"rollouts,":[34],"and":[35,54,107,144,158,170],"converting":[36],"their":[37],"pixel-space":[38],"motion":[39],"into":[40],"robot":[41,173],"actions":[42],"through":[43],"geometric":[44],"retargeting":[45,182],"further":[46],"introduces":[47],"cumulative":[48],"errors":[49],"from":[50],"imperfect":[51],"depth":[52],"estimation":[53],"keypoint":[55],"tracking.":[56],"To":[57],"address":[58],"these":[59],"challenges,":[60],"we":[61],"present":[62],"\\method{},":[63],"data-free":[65],"framework":[66],"aligns":[68],"VGM":[69,149],"outputs":[70],"with":[71],"compositional":[72,126],"constraints":[73,102,127],"generated":[74],"by":[75,201],"vision-language":[76],"(VLMs)":[78],"at":[79,135],"inference":[80],"time.":[81],"The":[82],"key":[83],"insight":[84],"is":[85],"VLMs":[87],"offer":[88],"capability":[90],"complementary":[91],"to":[92,104,120,151,180],"VGMs:":[93],"structured":[94],"spatial":[95],"reasoning":[96],"identify":[99],"the":[100,105,153,165,172,176,197,205],"physical":[101],"critical":[103],"success":[106,199],"safety":[108],"of":[109,125,148],"manipulation":[110,190],"execution.":[111],"Given":[112],"language":[114],"instruction,":[115],"\\method{}":[116,186],"uses":[117,164],"VLM":[119],"automatically":[121],"extract":[122],"set":[124,179],"capturing":[128],"task-specific":[129,210],"requirements,":[130],"which":[131,142,163],"are":[132],"then":[133],"applied":[134],"two":[136],"stages:":[137],"(1)":[138],"constraint-guided":[139],"selection,":[141],"scores":[143],"filters":[145],"batch":[147],"rollouts":[150],"retain":[152],"most":[154],"plausible":[156],"candidate,":[157],"(2)":[159],"constraint-based":[160],"trajectory":[161,174],"optimization,":[162],"selected":[166],"as":[168],"initialization":[169],"refines":[171],"under":[175],"same":[177],"constraint":[178],"correct":[181],"errors.":[183],"We":[184],"evaluate":[185],"six":[188],"real-robot":[189],"tasks":[191],"requiring":[192],"precise,":[193],"constraint-sensitive":[194],"execution,":[195],"improving":[196],"overall":[198],"rate":[200],"43.3\\%":[202],"points":[203],"over":[204],"strongest":[206],"baseline":[207],"without":[208],"any":[209],"training":[211],"data.":[212]},"counts_by_year":[],"updated_date":"2026-03-10T14:13:21.323994","created_date":"2026-03-10T00:00:00"}
