{"id":"https://openalex.org/W7128404289","doi":"https://doi.org/10.48550/arxiv.2602.06382","title":"Now You See That: Learning End-to-End Humanoid Locomotion from Raw Pixels","display_name":"Now You See That: Learning End-to-End Humanoid Locomotion from Raw Pixels","publication_year":2026,"publication_date":"2026-02-06","ids":{"openalex":"https://openalex.org/W7128404289","doi":"https://doi.org/10.48550/arxiv.2602.06382"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.06382","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125408587","display_name":"Wandong Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Wandong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125429403","display_name":"Yongbo Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Yongbo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125390552","display_name":"Leoric Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Leoric","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040219687","display_name":"Alex Zhang","orcid":"https://orcid.org/0000-0002-6901-1993"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Alex","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125403166","display_name":"Dwyane Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Dwyane","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066642227","display_name":"Mu San","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"San, Mu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125375936","display_name":"Daniel Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Daniel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125381382","display_name":"Ellie Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Ellie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125404141","display_name":"Finn Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Finn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125385094","display_name":"Ethan Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Ethan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125379245","display_name":"Zongwu Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Zongwu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5125408587"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.15729999542236328,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.15729999542236328,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10879","display_name":"Robotic Locomotion and Control","score":0.14920000731945038,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.11860000342130661,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/humanoid-robot","display_name":"Humanoid robot","score":0.6302000284194946},{"id":"https://openalex.org/keywords/terrain","display_name":"Terrain","score":0.5910000205039978},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.484499990940094},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.45989999175071716},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.41609999537467957},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4043000042438507},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3783000111579895},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.3725999891757965}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7505999803543091},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6894999742507935},{"id":"https://openalex.org/C60692881","wikidata":"https://www.wikidata.org/wiki/Q584529","display_name":"Humanoid robot","level":3,"score":0.6302000284194946},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6105999946594238},{"id":"https://openalex.org/C161840515","wikidata":"https://www.wikidata.org/wiki/Q186131","display_name":"Terrain","level":2,"score":0.5910000205039978},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.484499990940094},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.45989999175071716},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.41609999537467957},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4043000042438507},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3783000111579895},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.3725999891757965},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.36959999799728394},{"id":"https://openalex.org/C50637493","wikidata":"https://www.wikidata.org/wiki/Q1136781","display_name":"Morphing","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.3644999861717224},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.30469998717308044},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C68537008","wikidata":"https://www.wikidata.org/wiki/Q247932","display_name":"Stereopsis","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.27900001406669617},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2646999955177307},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2535000145435333}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.06382","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.06382","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.06382","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.06382","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.7592443227767944,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Achieving":[0],"robust":[1,53,151],"vision-based":[2],"humanoid":[3,50,139],"locomotion":[4],"remains":[5],"challenging":[6],"due":[7],"to":[8,100],"two":[9,138],"fundamental":[10],"issues:":[11],"the":[12,123],"sim-to-real":[13,54],"gap":[14],"introduces":[15],"significant":[16],"perception":[17],"noise":[18],"that":[19,63,83],"degrades":[20],"performance":[21,152],"on":[22,137],"fine-grained":[23,170],"tasks,":[24,91],"and":[25,68,116,126,164],"training":[26],"a":[27,58,78],"unified":[28],"policy":[29,149],"across":[30,153],"diverse":[31,154],"terrains":[32],"is":[33],"hindered":[34],"by":[35],"conflicting":[36],"learning":[37],"objectives.":[38],"To":[39],"address":[40],"these":[41],"challenges,":[42],"we":[43,56,108],"present":[44],"an":[45],"end-to-end":[46],"framework":[47],"for":[48],"vision-driven":[49],"locomotion.":[51],"For":[52,104],"transfer,":[55],"develop":[57],"high-fidelity":[59],"depth":[60,102,145],"sensor":[61],"simulation":[62],"captures":[64],"stereo":[65,144],"matching":[66],"artifacts":[67],"calibration":[69],"uncertainties":[70],"inherent":[71],"in":[72],"real-world":[73],"sensing.":[74],"We":[75,133],"further":[76],"propose":[77],"vision-aware":[79],"behavior":[80],"distillation":[81],"approach":[82,136],"combines":[84],"latent":[85],"space":[86],"alignment":[87],"with":[88,114,142],"noise-invariant":[89],"auxiliary":[90],"enabling":[92],"effective":[93],"knowledge":[94],"transfer":[95],"from":[96],"privileged":[97],"height":[98],"maps":[99],"noisy":[101],"observations.":[103],"versatile":[105],"terrain":[106,131],"adaptation,":[107],"introduce":[109],"terrain-specific":[110],"reward":[111],"shaping":[112],"integrated":[113],"multi-critic":[115],"multi-discriminator":[117],"learning,":[118],"where":[119],"dedicated":[120],"networks":[121],"capture":[122],"distinct":[124],"dynamics":[125],"motion":[127],"priors":[128],"of":[129],"each":[130],"type.":[132],"validate":[134],"our":[135],"platforms":[140,163],"equipped":[141],"different":[143],"cameras.":[146],"The":[147],"resulting":[148],"demonstrates":[150],"environments,":[155],"seamlessly":[156],"handling":[157],"extreme":[158],"challenges":[159],"such":[160],"as":[161,167,169],"high":[162],"wide":[165],"gaps,":[166],"well":[168],"tasks":[171],"including":[172],"bidirectional":[173],"long-term":[174],"staircase":[175],"traversal.":[176]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-10T00:00:00"}
