{"id":"https://openalex.org/W7153217534","doi":"https://doi.org/10.48550/arxiv.2604.08509","title":"Visually-grounded Humanoid Agents","display_name":"Visually-grounded Humanoid Agents","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153217534","doi":"https://doi.org/10.48550/arxiv.2604.08509"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.08509","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08509","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.08509","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133324733","display_name":"Hang Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ye, Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133371874","display_name":"Xiaoxuan Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Xiaoxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133387695","display_name":"Fan Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000896822","display_name":"Wayne Wu","orcid":"https://orcid.org/0000-0002-1364-8151"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Wayne","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046965918","display_name":"Kwan-Yee Lin","orcid":"https://orcid.org/0000-0003-0175-6398"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Kwan-Yee","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133333564","display_name":"Yizhou Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yizhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5133324733"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.3896999955177307,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.3896999955177307,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.23739999532699585,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.17440000176429749,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5598000288009644},{"id":"https://openalex.org/keywords/humanoid-robot","display_name":"Humanoid robot","score":0.5507000088691711},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5335000157356262},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5221999883651733},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5203999876976013},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4429999887943268},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.43860000371932983},{"id":"https://openalex.org/keywords/active-perception","display_name":"Active perception","score":0.42320001125335693}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6822999715805054},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.6536999940872192},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5598000288009644},{"id":"https://openalex.org/C60692881","wikidata":"https://www.wikidata.org/wiki/Q584529","display_name":"Humanoid robot","level":3,"score":0.5507000088691711},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5472999811172485},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5335000157356262},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5221999883651733},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5203999876976013},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4429999887943268},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.43860000371932983},{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.42320001125335693},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.4018999934196472},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39250001311302185},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.37310001254081726},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3668000102043152},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3578000068664551},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.3253999948501587},{"id":"https://openalex.org/C13687954","wikidata":"https://www.wikidata.org/wiki/Q4826847","display_name":"Autonomous agent","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3172999918460846},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.27129998803138733}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.08509","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08509","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.08509","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.08509","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6443856954574585,"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Digital":[0],"human":[1,126,216],"generation":[2],"has":[3],"been":[4],"studied":[5],"for":[6],"decades":[7],"and":[8,49,97,122,144,155,202,207,218,225],"supports":[9],"a":[10,82,179],"wide":[11],"range":[12],"of":[13],"real-world":[14,103,116],"applications.":[15],"However,":[16],"most":[17],"existing":[18],"systems":[19],"are":[20],"passively":[21],"animated,":[22],"relying":[23],"on":[24],"privileged":[25],"state":[26],"or":[27],"scripted":[28],"control,":[29],"which":[30,158],"limits":[31],"scalability":[32],"to":[33,147,169,181],"novel":[34,53],"environments.":[35,188],"We":[36,176],"instead":[37],"ask:":[38],"how":[39],"can":[40],"digital":[41,64,215],"humans":[42,65,89],"actively":[43],"behave":[44,98],"using":[45],"only":[46],"visual":[47],"observations":[48],"specified":[50],"goals":[51],"in":[52,102,173,185],"scenes?":[54],"Achieving":[55],"this":[56,75],"would":[57],"enable":[58],"populating":[59],"any":[60],"3D":[61,104,112],"environments":[62],"with":[63,140,152],"at":[66,90,162],"scale":[67],"that":[68,87],"exhibit":[69],"spontaneous,":[70],"natural,":[71],"goal-directed":[72],"behaviors.":[73],"To":[74],"end,":[76],"we":[77],"introduce":[78,178],"Visually-grounded":[79],"Humanoid":[80],"Agents,":[81],"coupled":[83],"two-layer":[84],"(world-agent)":[85],"paradigm":[86],"replicates":[88],"multiple":[91],"levels:":[92],"they":[93],"look,":[94],"perceive,":[95],"reason,":[96],"like":[99],"real":[100],"people":[101],"scenes.":[105],"The":[106,128],"World":[107],"Layer":[108,130],"reconstructs":[109],"semantically":[110],"rich":[111],"Gaussian":[113],"scenes":[114],"from":[115],"videos":[117],"via":[118],"an":[119],"occlusion-aware":[120],"pipeline":[121],"accommodates":[123],"animatable":[124],"Gaussian-based":[125],"avatars.":[127],"Agent":[129],"transforms":[131],"these":[132],"avatars":[133],"into":[134],"autonomous":[135,195],"humanoid":[136],"agents,":[137],"equipping":[138],"them":[139,146],"first-person":[141],"RGB-D":[142],"perception":[143],"enabling":[145],"perform":[148],"accurate,":[149],"embodied":[150,221],"planning":[151,209],"spatial":[153],"awareness":[154],"iterative":[156],"reasoning,":[157],"is":[159],"then":[160],"executed":[161],"the":[163,174],"low":[164],"level":[165],"as":[166],"full-body":[167],"actions":[168],"drive":[170],"their":[171],"behaviors":[172],"scene.":[175],"further":[177],"benchmark":[180],"evaluate":[182],"humanoid-scene":[183],"interaction":[184],"diverse":[186],"reconstructed":[187],"Experiments":[189],"show":[190],"our":[191],"agents":[192],"achieve":[193],"robust":[194],"behavior,":[196],"yielding":[197],"higher":[198],"task":[199],"success":[200],"rates":[201],"fewer":[203],"collisions":[204],"than":[205],"ablations":[206],"state-of-the-art":[208],"methods.":[210],"This":[211],"work":[212],"enables":[213],"active":[214],"population":[217],"advances":[219],"human-centric":[220],"AI.":[222],"Data,":[223],"code,":[224],"models":[226],"will":[227],"be":[228],"open-sourced.":[229]},"counts_by_year":[],"updated_date":"2026-04-11T06:19:08.300824","created_date":"2026-04-11T00:00:00"}
