{"id":"https://openalex.org/W7150817817","doi":"https://doi.org/10.48550/arxiv.2604.02759","title":"OMNI-PoseX: A Fast Vision Model for 6D Object Pose Estimation in Embodied Tasks","display_name":"OMNI-PoseX: A Fast Vision Model for 6D Object Pose Estimation in Embodied Tasks","publication_year":2026,"publication_date":"2026-04-03","ids":{"openalex":"https://openalex.org/W7150817817","doi":"https://doi.org/10.48550/arxiv.2604.02759"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.02759","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02759","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.02759","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133019878","display_name":"Michael Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124948574","display_name":"Wei Ying","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ying, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124885138","display_name":"Fangwen Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Fangwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133046825","display_name":"Shifeng Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Shifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5038896099","display_name":"Hanwen Kang","orcid":"https://orcid.org/0000-0001-5553-9239"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Hanwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5133019878"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.9592999815940857,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.9592999815940857,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.012500000186264515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.005200000014156103,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pose","display_name":"Pose","score":0.8766000270843506},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6484000086784363},{"id":"https://openalex.org/keywords/articulated-body-pose-estimation","display_name":"Articulated body pose estimation","score":0.6014999747276306},{"id":"https://openalex.org/keywords/3d-pose-estimation","display_name":"3D pose estimation","score":0.5892000198364258},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.43130001425743103},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.42820000648498535},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4246000051498413},{"id":"https://openalex.org/keywords/active-perception","display_name":"Active perception","score":0.41359999775886536},{"id":"https://openalex.org/keywords/cognitive-neuroscience-of-visual-object-recognition","display_name":"Cognitive neuroscience of visual object recognition","score":0.40619999170303345}],"concepts":[{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.8766000270843506},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7815999984741211},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6844000220298767},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6567999720573425},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6484000086784363},{"id":"https://openalex.org/C22100474","wikidata":"https://www.wikidata.org/wiki/Q4800952","display_name":"Articulated body pose estimation","level":4,"score":0.6014999747276306},{"id":"https://openalex.org/C36613465","wikidata":"https://www.wikidata.org/wiki/Q4636322","display_name":"3D pose estimation","level":3,"score":0.5892000198364258},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.43130001425743103},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.42820000648498535},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4246000051498413},{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.41359999775886536},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.40619999170303345},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3871999979019165},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.38040000200271606},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.3643999993801117},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3587000072002411},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.31839999556541443},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C16345878","wikidata":"https://www.wikidata.org/wiki/Q107472979","display_name":"Orientation (vector space)","level":2,"score":0.3027999997138977},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29750001430511475},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.29249998927116394},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2838999927043915},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C189645446","wikidata":"https://www.wikidata.org/wiki/Q350865","display_name":"Mirroring","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.26109999418258667}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.02759","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02759","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.02759","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.02759","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Accurate":[0],"6D":[1,94,108],"object":[2,113],"pose":[3,63,95,109,125,131,151],"estimation":[4],"is":[5,104],"a":[6,44,50,76,122],"fundamental":[7],"capability":[8],"for":[9],"embodied":[10],"agents,":[11],"yet":[12],"remains":[13],"highly":[14],"challenging":[15],"in":[16,38],"open-world":[17,124],"environments.":[18],"Many":[19],"existing":[20],"methods":[21],"often":[22],"rely":[23],"on":[24,86,106],"closed-set":[25],"assumptions":[26],"or":[27],"geometry-agnostic":[28],"regression":[29],"schemes,":[30],"limiting":[31],"their":[32],"generalization,":[33,101,136],"stability,":[34],"and":[35,74,92,100,117,137,153],"real-time":[36,154],"applicability":[37],"robotic":[39,139],"systems.":[40],"We":[41],"present":[42],"OMNI-PoseX,":[43],"vision":[45],"foundation":[46],"model":[47,103],"that":[48,81,161],"introduces":[49],"novel":[51],"network":[52],"architecture":[53,66],"unifying":[54],"open-vocabulary":[55],"perception":[56],"with":[57],"an":[58],"SO(3)-aware":[59],"reflected":[60],"flow":[61],"matching":[62],"predictor.":[64],"The":[65,147],"decouples":[67],"object-level":[68],"understanding":[69],"from":[70],"geometry-consistent":[71],"rotation":[72],"inference,":[73],"employs":[75],"lightweight":[77],"multi-modal":[78],"fusion":[79],"strategy":[80],"conditions":[82],"rotation-sensitive":[83],"geometric":[84],"features":[85],"compact":[87],"semantic":[88],"embeddings,":[89],"enabling":[90],"efficient":[91],"stable":[93],"estimation.":[96],"To":[97],"enhance":[98],"robustness":[99],"the":[102,143],"trained":[105],"large-scale":[107],"datasets,":[110],"leveraging":[111],"broad":[112],"diversity,":[114],"viewpoint":[115],"variation,":[116],"scene":[118],"complexity":[119],"to":[120],"build":[121],"scalable":[123],"backbone.":[126],"Comprehensive":[127],"evaluations":[128],"across":[129],"benchmark":[130],"estimation,":[132],"ablation":[133],"studies,":[134],"zero-shot":[135],"system-level":[138],"grasping":[140,164],"integration":[141],"demonstrate":[142],"effectiveness":[144],"of":[145,165],"OMNI-PoseX.":[146],"OMNI-PoseX":[148],"achieves":[149],"SOTA":[150],"accuracy":[152],"efficiency,":[155],"while":[156],"delivering":[157],"geometrically":[158],"consistent":[159],"predictions":[160],"enable":[162],"reliable":[163],"diverse,":[166],"previously":[167],"unseen":[168],"objects.":[169]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-07T00:00:00"}
