{"id":"https://openalex.org/W7131402460","doi":"https://doi.org/10.48550/arxiv.2602.20119","title":"NovaPlan: Zero-Shot Long-Horizon Manipulation via Closed-Loop Video Language Planning","display_name":"NovaPlan: Zero-Shot Long-Horizon Manipulation via Closed-Loop Video Language Planning","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131402460","doi":"https://doi.org/10.48550/arxiv.2602.20119"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.20119","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20119","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.20119","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086902028","display_name":"Jiahui Fu","orcid":"https://orcid.org/0000-0001-8298-6659"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fu, Jiahui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064611222","display_name":"Junyu Nan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nan, Junyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126810839","display_name":"Lingfeng Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Lingfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126779933","display_name":"Hongyu Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hongyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059057417","display_name":"Jianing Qian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Jianing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018716486","display_name":"Jennifer Barry","orcid":"https://orcid.org/0000-0001-6841-1613"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barry, Jennifer L.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126851125","display_name":"Kris Kitani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kitani, Kris","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126808061","display_name":"George Konidaris","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Konidaris, George","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5086902028"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7649999856948853,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7649999856948853,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.1378999948501587,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.019200000911951065,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.6697999835014343},{"id":"https://openalex.org/keywords/kinematics","display_name":"Kinematics","score":0.5356000065803528},{"id":"https://openalex.org/keywords/planner","display_name":"Planner","score":0.5235999822616577},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4659000039100647},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.45730000734329224},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.38929998874664307},{"id":"https://openalex.org/keywords/programming-by-demonstration","display_name":"Programming by demonstration","score":0.35260000824928284},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.32839998602867126},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.32519999146461487}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7736999988555908},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.6697999835014343},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5774999856948853},{"id":"https://openalex.org/C39920418","wikidata":"https://www.wikidata.org/wiki/Q11476","display_name":"Kinematics","level":2,"score":0.5356000065803528},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.5235999822616577},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4659000039100647},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.45730000734329224},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43549999594688416},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3946000039577484},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.38929998874664307},{"id":"https://openalex.org/C2779038628","wikidata":"https://www.wikidata.org/wiki/Q7248497","display_name":"Programming by demonstration","level":3,"score":0.35260000824928284},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.32839998602867126},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.32519999146461487},{"id":"https://openalex.org/C74222875","wikidata":"https://www.wikidata.org/wiki/Q16000312","display_name":"Robot kinematics","level":4,"score":0.3176000118255615},{"id":"https://openalex.org/C2781089630","wikidata":"https://www.wikidata.org/wiki/Q21856745","display_name":"Realization (probability)","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3043999969959259},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.29440000653266907},{"id":"https://openalex.org/C60692881","wikidata":"https://www.wikidata.org/wiki/Q584529","display_name":"Humanoid robot","level":3,"score":0.29170000553131104},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.28870001435279846},{"id":"https://openalex.org/C65401140","wikidata":"https://www.wikidata.org/wiki/Q7353385","display_name":"Robot control","level":4,"score":0.2800999879837036},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.27810001373291016},{"id":"https://openalex.org/C2776544517","wikidata":"https://www.wikidata.org/wiki/Q189447","display_name":"Unexpected events","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C145460709","wikidata":"https://www.wikidata.org/wiki/Q859951","display_name":"Human\u2013robot interaction","level":3,"score":0.26170000433921814},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.25440001487731934},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2542000114917755},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.20119","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20119","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.20119","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20119","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Solving":[0],"long-horizon":[1,58,148],"tasks":[2,24,68,149,165],"requires":[3],"robots":[4],"to":[5,82,119],"integrate":[6],"high-level":[7],"semantic":[8],"reasoning":[9],"with":[10,51],"low-level":[11,92],"physical":[12,32],"interaction.":[13],"While":[14],"vision-language":[15],"models":[16,21],"(VLMs)":[17],"and":[18,25,48,71,97,103,114,150,166],"video":[19,49],"generation":[20],"can":[22,161],"decompose":[23],"imagine":[26],"outcomes,":[27],"they":[28],"often":[29],"lack":[30],"the":[31,61,80,111,121,142,151],"grounding":[33],"necessary":[34],"for":[35,56,127],"real-world":[36],"execution.":[37],"We":[38,140],"introduce":[39],"NovaPlan,":[40],"a":[41,64,76,116,125],"hierarchical":[42],"framework":[43],"that":[44,159],"unifies":[45],"closed-loop":[46],"VLM":[47,65],"planning":[50],"geometrically":[52],"grounded":[53],"robot":[54,73,93,128],"execution":[55,74,132],"zero-shot":[57],"manipulation.":[59],"At":[60],"high":[62],"level,":[63],"planner":[66],"decomposes":[67],"into":[69],"sub-goals":[70],"monitors":[72],"in":[75],"closed":[77],"loop,":[78],"enabling":[79],"system":[81],"recover":[83],"from":[84,110],"single-step":[85],"failures":[86],"through":[87],"autonomous":[88],"re-planning.":[89],"To":[90],"compute":[91],"actions,":[94,129],"we":[95],"extract":[96],"utilize":[98],"both":[99],"task-relevant":[100],"object":[101],"keypoints":[102],"human":[104],"hand":[105],"poses":[106],"as":[107,124],"kinematic":[108],"priors":[109],"generated":[112],"videos,":[113],"employ":[115],"switching":[117],"mechanism":[118],"choose":[120],"better":[122],"one":[123],"reference":[126],"maintaining":[130],"stable":[131],"even":[133],"under":[134],"heavy":[135],"occlusion":[136],"or":[137,176],"depth":[138],"inaccuracy.":[139],"demonstrate":[141],"effectiveness":[143],"of":[144],"NovaPlan":[145,160],"on":[146],"three":[147],"Functional":[152],"Manipulation":[153],"Benchmark":[154],"(FMB).":[155],"Our":[156],"results":[157],"show":[158],"perform":[162],"complex":[163],"assembly":[164],"exhibit":[167],"dexterous":[168],"error":[169],"recovery":[170],"behaviors":[171],"without":[172],"any":[173],"prior":[174],"demonstrations":[175],"training.":[177],"Project":[178],"page:":[179],"https://nova-plan.github.io/":[180]},"counts_by_year":[],"updated_date":"2026-02-26T06:34:08.959763","created_date":"2026-02-26T00:00:00"}
