{"id":"https://openalex.org/W4404307596","doi":"https://doi.org/10.48550/arxiv.2410.18962","title":"Where Am I and What Will I See: An Auto-Regressive Model for Spatial Localization and View Prediction","display_name":"Where Am I and What Will I See: An Auto-Regressive Model for Spatial Localization and View Prediction","publication_year":2024,"publication_date":"2024-10-24","ids":{"openalex":"https://openalex.org/W4404307596","doi":"https://doi.org/10.48550/arxiv.2410.18962"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.18962","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.18962","pdf_url":"https://arxiv.org/pdf/2410.18962","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.18962","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100738213","display_name":"Junyi Chen","orcid":"https://orcid.org/0000-0001-7448-4499"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Junyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101788179","display_name":"Di Huang","orcid":"https://orcid.org/0000-0002-3809-0123"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Di","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011295525","display_name":"Weicai Ye","orcid":"https://orcid.org/0000-0001-6215-1347"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Weicai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087818121","display_name":"Wanli Ouyang","orcid":"https://orcid.org/0000-0002-9163-2761"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Wanli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103147075","display_name":"Tong He","orcid":"https://orcid.org/0000-0003-3149-7243"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Tong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100738213"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.8744999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.8744999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.866599977016449,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoregressive-model","display_name":"Autoregressive model","score":0.5207664370536804},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.3726447820663452},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.35425612330436707},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.34679466485977173},{"id":"https://openalex.org/keywords/economics","display_name":"Economics","score":0.24834668636322021}],"concepts":[{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.5207664370536804},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.3726447820663452},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.35425612330436707},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.34679466485977173},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.24834668636322021}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.18962","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.18962","pdf_url":"https://arxiv.org/pdf/2410.18962","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.18962","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.18962","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.18962","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.18962","pdf_url":"https://arxiv.org/pdf/2410.18962","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2171218219","https://openalex.org/W1972271943","https://openalex.org/W2150410159","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W2156628102"],"abstract_inverted_index":{"Spatial":[0,84],"intelligence":[1],"is":[2],"the":[3,103,112,121,136,140,178,182],"ability":[4],"of":[5,42,143,163],"a":[6,87,107,115],"machine":[7],"to":[8,73,138,171],"perceive,":[9],"reason,":[10],"and":[11,18,53,96,110,126,146,166,188],"act":[12],"in":[13,22,46,151,174],"three":[14],"dimensions":[15],"within":[16],"space":[17],"time.":[19],"Recent":[20],"advancements":[21],"large-scale":[23],"auto-regressive":[24,89,153],"models":[25,36],"have":[26,61],"demonstrated":[27],"remarkable":[28],"capabilities":[29],"across":[30],"various":[31],"reasoning":[32],"tasks.":[33],"However,":[34],"these":[35],"often":[37],"struggle":[38],"with":[39],"fundamental":[40],"aspects":[41],"spatial":[43,94,124,149,186],"reasoning,":[44],"particularly":[45],"answering":[47],"questions":[48],"like":[49],"\"Where":[50],"am":[51],"I?\"":[52],"\"What":[54],"will":[55],"I":[56],"see?\".":[57],"While":[58],"some":[59],"attempts":[60],"been":[62],"done,":[63],"existing":[64],"approaches":[65],"typically":[66],"treat":[67],"them":[68],"as":[69],"separate":[70],"tasks,":[71,176],"failing":[72],"capture":[74],"their":[75,147],"interconnected":[76],"nature.":[77],"In":[78],"this":[79],"paper,":[80],"we":[81],"present":[82],"Generative":[83],"Transformer":[85],"(GST),":[86],"novel":[88,167],"framework":[90],"that":[91,160],"jointly":[92],"addresses":[93],"localization":[95],"view":[97,113,168],"prediction.":[98,128,190],"Our":[99],"model":[100,137],"simultaneously":[101],"estimates":[102],"camera":[104,117,132],"pose":[105,164],"from":[106,114],"single":[108],"image":[109],"predicts":[111],"new":[116],"pose,":[118],"effectively":[119],"bridging":[120],"gap":[122],"between":[123,185],"awareness":[125,187],"visual":[127,189],"The":[129],"proposed":[130],"innovative":[131],"tokenization":[133],"method":[134],"enables":[135],"learn":[139],"joint":[141,161],"distribution":[142],"2D":[144],"projections":[145],"corresponding":[148],"perspectives":[150],"an":[152],"manner.":[154],"This":[155],"unified":[156],"training":[157],"paradigm":[158],"demonstrates":[159],"optimization":[162],"estimation":[165],"synthesis":[169],"leads":[170],"improved":[172],"performance":[173],"both":[175],"for":[177],"first":[179],"time,":[180],"highlighting":[181],"inherent":[183],"relationship":[184]},"counts_by_year":[],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2024-11-13T00:00:00"}
