{"id":"https://openalex.org/W4416749400","doi":"https://doi.org/10.1109/iros60139.2025.11247293","title":"PosePilot: Steering Camera Pose for Generative World Models with Self-supervised Depth","display_name":"PosePilot: Steering Camera Pose for Generative World Models with Self-supervised Depth","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416749400","doi":"https://doi.org/10.1109/iros60139.2025.11247293"},"language":null,"primary_location":{"id":"doi:10.1109/iros60139.2025.11247293","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11247293","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024635574","display_name":"Bu Jin","orcid":"https://orcid.org/0000-0001-7577-2177"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Bu Jin","raw_affiliation_strings":["The Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008088626","display_name":"Weize Li","orcid":"https://orcid.org/0000-0003-3021-7641"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weize Li","raw_affiliation_strings":["Tsinghua University,Institute for AI Industry Research (AIR)"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Institute for AI Industry Research (AIR)","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042316856","display_name":"Bin Yang","orcid":"https://orcid.org/0000-0003-4953-3644"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Baihan Yang","raw_affiliation_strings":["Tsinghua University,Institute for AI Industry Research (AIR)"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Institute for AI Industry Research (AIR)","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101212562","display_name":"Zhenxin Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenxin Zhu","raw_affiliation_strings":["Tsinghua University,Institute for AI Industry Research (AIR)"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Institute for AI Industry Research (AIR)","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081783696","display_name":"Junpeng Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junpeng Jiang","raw_affiliation_strings":["Li Auto"],"affiliations":[{"raw_affiliation_string":"Li Auto","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041236976","display_name":"Huan-ang Gao","orcid":"https://orcid.org/0009-0004-6727-5778"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huan-ang Gao","raw_affiliation_strings":["Tsinghua University,Institute for AI Industry Research (AIR)"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Institute for AI Industry Research (AIR)","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100923998","display_name":"Haiyang Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haiyang Sun","raw_affiliation_strings":["Li Auto"],"affiliations":[{"raw_affiliation_string":"Li Auto","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007528778","display_name":"Kun Zhan","orcid":"https://orcid.org/0000-0002-4208-5220"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kun Zhan","raw_affiliation_strings":["Li Auto"],"affiliations":[{"raw_affiliation_string":"Li Auto","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005543941","display_name":"Hengtong Hu","orcid":"https://orcid.org/0000-0002-3095-6009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hengtong Hu","raw_affiliation_strings":["Li Auto"],"affiliations":[{"raw_affiliation_string":"Li Auto","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004662248","display_name":"Xueyang Zhang","orcid":"https://orcid.org/0000-0003-3625-219X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xueyang Zhang","raw_affiliation_strings":["Li Auto"],"affiliations":[{"raw_affiliation_string":"Li Auto","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107921149","display_name":"Peng Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng Jia","raw_affiliation_strings":["Li Auto"],"affiliations":[{"raw_affiliation_string":"Li Auto","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101213342","display_name":"Hao Zhao","orcid":"https://orcid.org/0000-0002-7578-9910"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Zhao","raw_affiliation_strings":["Tsinghua University,Institute for AI Industry Research (AIR)"],"affiliations":[{"raw_affiliation_string":"Tsinghua University,Institute for AI Industry Research (AIR)","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5024635574"],"corresponding_institution_ids":["https://openalex.org/I200769079"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.40142835,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"8051","last_page":"8058"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.5809000134468079,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.5809000134468079,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.19470000267028809,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.06289999932050705,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image-warping","display_name":"Image warping","score":0.8730999827384949},{"id":"https://openalex.org/keywords/pose","display_name":"Pose","score":0.6150000095367432},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5160999894142151},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5105999708175659},{"id":"https://openalex.org/keywords/view-synthesis","display_name":"View synthesis","score":0.5001000165939331},{"id":"https://openalex.org/keywords/articulated-body-pose-estimation","display_name":"Articulated body pose estimation","score":0.4526999890804291},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.4343000054359436},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4302000105381012},{"id":"https://openalex.org/keywords/camera-auto-calibration","display_name":"Camera auto-calibration","score":0.388700008392334}],"concepts":[{"id":"https://openalex.org/C157202957","wikidata":"https://www.wikidata.org/wiki/Q1659609","display_name":"Image warping","level":2,"score":0.8730999827384949},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7833999991416931},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.742900013923645},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6916000247001648},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.6150000095367432},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5160999894142151},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5105999708175659},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.5001000165939331},{"id":"https://openalex.org/C22100474","wikidata":"https://www.wikidata.org/wiki/Q4800952","display_name":"Articulated body pose estimation","level":4,"score":0.4526999890804291},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.4343000054359436},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4302000105381012},{"id":"https://openalex.org/C94816000","wikidata":"https://www.wikidata.org/wiki/Q5026006","display_name":"Camera auto-calibration","level":3,"score":0.388700008392334},{"id":"https://openalex.org/C88516994","wikidata":"https://www.wikidata.org/wiki/Q1268863","display_name":"Dynamic time warping","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.365200012922287},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.364300012588501},{"id":"https://openalex.org/C172849965","wikidata":"https://www.wikidata.org/wiki/Q3148875","display_name":"Reference frame","level":3,"score":0.3379000127315521},{"id":"https://openalex.org/C48209547","wikidata":"https://www.wikidata.org/wiki/Q1331104","display_name":"Controllability","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.3183000087738037},{"id":"https://openalex.org/C5799516","wikidata":"https://www.wikidata.org/wiki/Q4110915","display_name":"Visual odometry","level":3,"score":0.30559998750686646},{"id":"https://openalex.org/C36613465","wikidata":"https://www.wikidata.org/wiki/Q4636322","display_name":"3D pose estimation","level":3,"score":0.2922999858856201},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.26919999718666077},{"id":"https://openalex.org/C146159030","wikidata":"https://www.wikidata.org/wiki/Q7625099","display_name":"Structure from motion","level":3,"score":0.26829999685287476},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2578999996185303},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iros60139.2025.11247293","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iros60139.2025.11247293","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2133665775","https://openalex.org/W2300779272","https://openalex.org/W2609883120","https://openalex.org/W2963557767","https://openalex.org/W2963906250","https://openalex.org/W3035574168","https://openalex.org/W3174211490","https://openalex.org/W4312707458","https://openalex.org/W4383108371","https://openalex.org/W4383109298","https://openalex.org/W4386076400","https://openalex.org/W4390871799","https://openalex.org/W4390873407","https://openalex.org/W4391622623","https://openalex.org/W4400573519","https://openalex.org/W4401109681","https://openalex.org/W4401416274","https://openalex.org/W4401810286","https://openalex.org/W4402727021","https://openalex.org/W4402754035","https://openalex.org/W4402778069","https://openalex.org/W4402952251","https://openalex.org/W4403771534","https://openalex.org/W4404722496","https://openalex.org/W4409368599","https://openalex.org/W4413144960","https://openalex.org/W4413513763"],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,14,68,175,203],"autonomous":[3,159],"driving":[4,25,160],"(AD)":[5],"systems":[6],"have":[7],"highlighted":[8],"the":[9,102],"potential":[10],"of":[11,48],"world":[12,70,180,205],"models":[13],"achieving":[15],"robust":[16],"and":[17,23,33,45,90,98,107,146,154,161,172,178],"generalizable":[18],"performance":[19],"across":[20,131],"both":[21,176],"ordinary":[22],"challenging":[24],"conditions.":[26],"However,":[27],"a":[28,57,84,123,142,147,191],"key":[29],"challenge":[30],"remains:":[31],"precise":[32],"flexible":[34],"camera":[35,65,88,109,137,184],"pose":[36,66,89,99,138,148,185,195],"control,":[37],"which":[38],"is":[39],"crucial":[40],"for":[41,194],"accurate":[42],"viewpoint":[43,152,201],"transformation":[44],"realistic":[46],"simulation":[47],"scene":[49],"dynamics.":[50],"In":[51],"this":[52],"paper,":[53],"we":[54,94,140],"introduce":[55,141],"PosePilot,":[56],"lightweight":[58],"yet":[59],"powerful":[60],"framework":[61],"that":[62,127,166],"significantly":[63,168],"enhances":[64,169],"controllability":[67],"generative":[69,204],"models.":[71,181,206],"Drawing":[72],"inspiration":[73],"from":[74,112],"self-supervised":[75,96,187],"depth":[76,97,106],"estimation,":[77,139],"PosePilot":[78,167,189],"leverages":[79],"structure-from-motion":[80],"principles":[81],"to":[82,104],"establish":[83],"tight":[85],"coupling":[86],"between":[87],"video":[91,113,163],"generation.":[92],"Specifically,":[93],"incorporate":[95],"readouts,":[100],"allowing":[101],"model":[103],"infer":[105],"relative":[108],"motion":[110,173],"directly":[111],"sequences.":[114],"These":[115],"outputs":[116],"drive":[117],"pose-aware":[118],"frame":[119],"warping,":[120],"guided":[121],"by":[122],"photometric":[124],"warping":[125,144],"loss":[126],"enforces":[128],"geometric":[129],"consistency":[130],"synthesized":[132],"frames.":[133],"To":[134],"further":[135],"refine":[136],"reverse":[143],"step":[145],"regression":[149],"loss,":[150],"improving":[151],"precision":[153],"adaptability.":[155],"Extensive":[156],"experiments":[157],"on":[158],"general-domain":[162],"datasets":[164],"demonstrate":[165],"structural":[170],"understanding":[171],"reasoning":[174],"diffusion-based":[177],"auto-regressive":[179],"By":[182],"steering":[183],"with":[186],"depth,":[188],"sets":[190],"new":[192],"benchmark":[193],"controllability,":[196],"enabling":[197],"physically":[198],"consistent,":[199],"reliable":[200],"synthesis":[202]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-28T00:00:00"}
