{"id":"https://openalex.org/W7134236294","doi":"https://doi.org/10.48550/arxiv.2603.06561","title":"EgoReasoner: Learning Egocentric 4D Reasoning via Task-Adaptive Structured Thinking","display_name":"EgoReasoner: Learning Egocentric 4D Reasoning via Task-Adaptive Structured Thinking","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7134236294","doi":"https://doi.org/10.48550/arxiv.2603.06561"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.06561","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128489877","display_name":"Fangrui Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhu, Fangrui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089547220","display_name":"Yunfeng Xi","orcid":"https://orcid.org/0000-0002-6431-8996"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xi, Yunfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077817759","display_name":"Jianmo Ni","orcid":"https://orcid.org/0000-0002-6863-8073"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ni, Jianmo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128582098","display_name":"Mu Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Mu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128522259","display_name":"Boqing Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Boqing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128476708","display_name":"Long Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Long","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128388065","display_name":"Chen Qu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qu, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059115868","display_name":"Ian Miao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Miao, Ian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128538056","display_name":"Yi Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128549605","display_name":"Cheng Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103861361","display_name":"Huaizu Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Huaizu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5039879761","display_name":"Shwetak Patel","orcid":"https://orcid.org/0000-0002-6300-4389"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Shwetak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5128489877"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.31610000133514404,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.31610000133514404,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.24570000171661377,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.06289999932050705,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6507999897003174},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5953999757766724},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5507000088691711},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.5157999992370605},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.4848000109195709},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.4456000030040741},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.41940000653266907},{"id":"https://openalex.org/keywords/structured-prediction","display_name":"Structured prediction","score":0.39480000734329224}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6837000250816345},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6507999897003174},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6254000067710876},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5953999757766724},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5507000088691711},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.5157999992370605},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.4848000109195709},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.4456000030040741},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.41940000653266907},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.39480000734329224},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.3393000066280365},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3221000134944916},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C170494330","wikidata":"https://www.wikidata.org/wiki/Q1778434","display_name":"Cognitive map","level":3,"score":0.3125999867916107},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2922999858856201},{"id":"https://openalex.org/C2780226923","wikidata":"https://www.wikidata.org/wiki/Q929848","display_name":"Movement (music)","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26429998874664307},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2612999975681305},{"id":"https://openalex.org/C2778662690","wikidata":"https://www.wikidata.org/wiki/Q3125339","display_name":"Spatial ability","level":3,"score":0.2590999901294708},{"id":"https://openalex.org/C40506919","wikidata":"https://www.wikidata.org/wiki/Q7452469","display_name":"Sequence learning","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.25519999861717224}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.06561","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.06561","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.06561","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.06561","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.5419799089431763,"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Egocentric":[0],"video":[1],"understanding":[2],"is":[3],"inherently":[4],"complex":[5],"due":[6],"to":[7,115,138],"the":[8,13,108,112,121,128,136,148,186],"dynamic":[9],"4D":[10,38],"nature":[11],"of":[12,25,35,130],"environment,":[14],"where":[15],"camera":[16],"motion":[17],"and":[18,52,66,86,111,159],"object":[19,48,54],"displacements":[20],"necessitate":[21],"a":[22,33,102],"continuous":[23],"re-evaluation":[24],"spatial":[26,62,94],"relations.":[27],"In":[28,120,147],"this":[29],"work,":[30],"we":[31,99],"target":[32],"suite":[34],"under-explored":[36],"egocentric":[37],"reasoning":[39,84,109,166],"tasks,":[40],"including":[41],"fixture":[42,46],"interaction":[43],"counting,":[44],"viewpoint-relative":[45],"location,":[47],"movement":[49],"itinerary":[50],"tracking,":[51,65],"stationary":[53],"localization,":[55],"that":[56,71,105,134],"require":[57],"fundamentally":[58],"different":[59],"cognitive":[60,118],"operations:":[61],"anchoring,":[63],"temporal":[64,157],"duration":[67],"reasoning.":[68],"We":[69],"observe":[70],"these":[72],"structural":[73],"differences":[74],"make":[75],"task-agnostic":[76],"approaches":[77],"insufficient:":[78],"generic":[79],"Chain-of-Thought":[80],"methods":[81],"lack":[82],"task-appropriate":[83],"primitives,":[85],"uniform":[87],"reinforcement":[88,169],"learning":[89],"actively":[90],"destabilizes":[91],"performance":[92],"on":[93,177,185],"tasks.":[95],"To":[96],"address":[97],"this,":[98],"propose":[100],"EgoReasoner,":[101],"two-stage":[103],"framework":[104],"aligns":[106],"both":[107],"scaffold":[110],"reward":[113,152],"signal":[114],"each":[116,165],"task's":[117],"structure.":[119],"first":[122],"stage,":[123,150],"Task-Adaptive":[124],"Thinking":[125],"Templates":[126],"guide":[127],"synthesis":[129],"structured":[131],"CoT":[132],"traces":[133],"teach":[135],"model":[137],"reason":[139],"adaptively":[140],"across":[141],"task":[142],"types":[143],"via":[144,168],"supervised":[145],"fine-tuning.":[146],"second":[149],"task-aware":[151],"functions":[153],"verify":[154],"entity":[155],"grounding,":[156],"alignment,":[158],"task-adaptive":[160],"logical":[161],"consistency,":[162],"selectively":[163],"strengthening":[164],"pathway":[167],"fine-tuning":[170],"with":[171],"GRPO.":[172],"Our":[173],"3B-parameter":[174],"model,":[175],"trained":[176],"only":[178],"16K":[179],"samples,":[180],"achieves":[181],"37.5%":[182],"average":[183],"accuracy":[184],"challenging":[187],"HD-EPIC":[188],"benchmark,":[189],"surpassing":[190],"Qwen2.5-VL-7B":[191],"(25.7%)":[192],"by":[193],"over":[194],"10":[195],"points.":[196]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-10T00:00:00"}
