{"id":"https://openalex.org/W7151835737","doi":"https://doi.org/10.48550/arxiv.2604.04974","title":"From Video to Control: A Survey of Learning Manipulation Interfaces from Temporal Visual Data","display_name":"From Video to Control: A Survey of Learning Manipulation Interfaces from Temporal Visual Data","publication_year":2026,"publication_date":"2026-04-04","ids":{"openalex":"https://openalex.org/W7151835737","doi":"https://doi.org/10.48550/arxiv.2604.04974"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04974","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071013046","display_name":"Linfang Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zheng, Linfang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059407236","display_name":"Zikai Ouyang","orcid":"https://orcid.org/0000-0002-7696-0548"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Zikai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133176817","display_name":"Chen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133207819","display_name":"Jia Hong Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Jia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133154745","display_name":"Wei Zhang","orcid":"https://orcid.org/0000-0002-3706-5989"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Wei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5071013046"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.3463999927043915,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.3463999927043915,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.14059999585151672,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.1348000019788742,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.611299991607666},{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.546500027179718},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5450000166893005},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5121999979019165},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.47540000081062317},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4327000081539154},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4196999967098236},{"id":"https://openalex.org/keywords/closing","display_name":"Closing (real estate)","score":0.4041000008583069}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7497000098228455},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.611299991607666},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5717999935150146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5575000047683716},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.546500027179718},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5450000166893005},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5121999979019165},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.47540000081062317},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4327000081539154},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4196999967098236},{"id":"https://openalex.org/C2778775528","wikidata":"https://www.wikidata.org/wiki/Q5135432","display_name":"Closing (real estate)","level":2,"score":0.4041000008583069},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34940001368522644},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3075999915599823},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C145460709","wikidata":"https://www.wikidata.org/wiki/Q859951","display_name":"Human\u2013robot interaction","level":3,"score":0.26100000739097595},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2590999901294708},{"id":"https://openalex.org/C138958017","wikidata":"https://www.wikidata.org/wiki/Q190087","display_name":"Data type","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.25189998745918274},{"id":"https://openalex.org/C65401140","wikidata":"https://www.wikidata.org/wiki/Q7353385","display_name":"Robot control","level":4,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.4059462547302246,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"is":[1,85,137],"a":[2,111],"scalable":[3],"observation":[4],"of":[5],"physical":[6,56],"dynamics:":[7],"it":[8,91],"captures":[9],"how":[10,13,17,134],"objects":[11],"move,":[12],"contact":[14],"unfolds,":[15],"and":[16,47,55,87,115,145,177],"scenes":[18],"evolve":[19],"under":[20],"interaction":[21],"--":[22,133,165,176],"all":[23],"without":[24],"requiring":[25],"robot":[26,50,174],"action":[27,45],"labels.":[28],"Yet":[29],"translating":[30],"this":[31,184],"temporal":[32,65,108],"structure":[33,109],"into":[34],"reliable":[35],"robotic":[36,72],"control":[37,69,89],"remains":[38],"an":[39,76],"open":[40,157],"challenge,":[41],"because":[42],"video":[43,66],"lacks":[44],"supervision":[46],"differs":[48],"from":[49],"experience":[51],"in":[52],"embodiment,":[53],"viewpoint,":[54],"constraints.":[57],"This":[58],"survey":[59],"reviews":[60],"methods":[61],"that":[62,153,168],"exploit":[63],"non-action-annotated":[64],"to":[67,172],"learn":[68],"interfaces":[70],"for":[71,123],"manipulation.":[73],"We":[74],"introduce":[75],"\\emph{interface-centric":[77],"taxonomy}":[78],"organized":[79],"by":[80],"where":[81,146],"the":[82,101,135,154,161,166],"video-to-control":[83],"interface":[84,102],"constructed":[86],"what":[88,139],"properties":[90,132],"enables,":[92],"identifying":[93],"three":[94],"families:":[95],"direct":[96],"video--action":[97],"policies,":[98],"which":[99,106,119],"keep":[100],"implicit;":[103],"latent-action":[104],"methods,":[105],"route":[107],"through":[110],"compact":[112],"learned":[113],"intermediate;":[114],"explicit":[116],"visual":[117],"interfaces,":[118],"predict":[120],"interpretable":[121],"targets":[122],"downstream":[124],"control.":[125],"For":[126],"each":[127],"family,":[128],"we":[129,178],"analyze":[130],"control-integration":[131],"loop":[136],"closed,":[138],"can":[140],"be":[141],"verified":[142],"before":[143],"execution,":[144],"failures":[147],"enter.":[148],"A":[149],"cross-family":[150],"synthesis":[151],"reveals":[152],"most":[155],"pressing":[156],"challenges":[158],"center":[159],"on":[160],"\\emph{robotics":[162],"integration":[163],"layer}":[164],"mechanisms":[167],"connect":[169],"video-derived":[170],"predictions":[171],"dependable":[173],"behavior":[175],"outline":[179],"research":[180],"directions":[181],"toward":[182],"closing":[183],"gap.":[185]},"counts_by_year":[],"updated_date":"2026-04-09T06:13:59.934233","created_date":"2026-04-09T00:00:00"}
