{"id":"https://openalex.org/W4416707012","doi":"https://doi.org/10.1109/tcsvt.2025.3637304","title":"SwinFVO: Self-Supervised Visual Odometry With Enhanced Global Spatiotemporal Perception","display_name":"SwinFVO: Self-Supervised Visual Odometry With Enhanced Global Spatiotemporal Perception","publication_year":2025,"publication_date":"2025-11-26","ids":{"openalex":"https://openalex.org/W4416707012","doi":"https://doi.org/10.1109/tcsvt.2025.3637304"},"language":null,"primary_location":{"id":"doi:10.1109/tcsvt.2025.3637304","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3637304","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054245629","display_name":"Rujun Song","orcid":"https://orcid.org/0000-0002-8559-7541"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rujun Song","raw_affiliation_strings":["School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China"],"raw_orcid":"https://orcid.org/0000-0002-8559-7541","affiliations":[{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027057491","display_name":"Ruoqi Li","orcid":"https://orcid.org/0000-0003-3730-9370"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruoqi Li","raw_affiliation_strings":["School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103052017","display_name":"Zhuoling Xiao","orcid":"https://orcid.org/0000-0002-8118-2330"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhuoling Xiao","raw_affiliation_strings":["School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China"],"raw_orcid":"https://orcid.org/0000-0002-8118-2330","affiliations":[{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101804797","display_name":"Bo Yan","orcid":"https://orcid.org/0000-0001-5692-3486"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Yan","raw_affiliation_strings":["School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Information and Communication Engineering, University of Electronic Science and Technology of China, Sichuan, China","institution_ids":["https://openalex.org/I150229711"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5054245629"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.43334701,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"36","issue":"4","first_page":"5258","last_page":"5273"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.664900004863739,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.664900004863739,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.19820000231266022,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10868","display_name":"Soft Robotics and Applications","score":0.028999999165534973,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-odometry","display_name":"Visual odometry","score":0.8614000082015991},{"id":"https://openalex.org/keywords/optical-flow","display_name":"Optical flow","score":0.7114999890327454},{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.6323999762535095},{"id":"https://openalex.org/keywords/odometry","display_name":"Odometry","score":0.6164000034332275},{"id":"https://openalex.org/keywords/motion-estimation","display_name":"Motion estimation","score":0.5389000177383423},{"id":"https://openalex.org/keywords/pose","display_name":"Pose","score":0.45750001072883606},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.4253999888896942},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.39890000224113464}],"concepts":[{"id":"https://openalex.org/C5799516","wikidata":"https://www.wikidata.org/wiki/Q4110915","display_name":"Visual odometry","level":3,"score":0.8614000082015991},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.8246999979019165},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7534000277519226},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.7501000165939331},{"id":"https://openalex.org/C155542232","wikidata":"https://www.wikidata.org/wiki/Q736111","display_name":"Optical flow","level":3,"score":0.7114999890327454},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.6323999762535095},{"id":"https://openalex.org/C49441653","wikidata":"https://www.wikidata.org/wiki/Q2014717","display_name":"Odometry","level":4,"score":0.6164000034332275},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.5389000177383423},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.45750001072883606},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.4253999888896942},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.39890000224113464},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.3702000081539154},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C146159030","wikidata":"https://www.wikidata.org/wiki/Q7625099","display_name":"Structure from motion","level":3,"score":0.33640000224113464},{"id":"https://openalex.org/C2983787585","wikidata":"https://www.wikidata.org/wiki/Q93586","display_name":"Feature matching","level":3,"score":0.32670000195503235},{"id":"https://openalex.org/C19966478","wikidata":"https://www.wikidata.org/wiki/Q4810574","display_name":"Mobile robot","level":3,"score":0.32089999318122864},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.30889999866485596},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3021000027656555},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.28850001096725464},{"id":"https://openalex.org/C81074085","wikidata":"https://www.wikidata.org/wiki/Q366872","display_name":"Motion planning","level":3,"score":0.28290000557899475},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.26249998807907104}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2025.3637304","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3637304","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pose":[0],"estimation":[1,168],"using":[2],"visual":[3,19,78],"sensors":[4],"has":[5,22],"become":[6],"a":[7,75,105,124],"fundamental":[8],"component":[9],"in":[10,49,163],"robotic":[11],"navigation":[12],"and":[13,34,58,137,145,158,166,177,187,192],"autonomous":[14],"driving":[15],"systems.":[16],"Learning-based":[17],"monocular":[18],"odometry":[20,79],"(VO)":[21],"attracted":[23],"substantial":[24],"attention":[25],"due":[26],"to":[27,30,86,117,123,185],"its":[28],"resilience":[29],"camera":[31,39],"parameter":[32],"variations":[33],"dynamic":[35],"environments.":[36],"Given":[37],"that":[38,81],"movement":[40],"manifests":[41],"as":[42],"pixel-level":[43],"motion":[44,84,97],"across":[45,151],"the":[46,119,131,155],"entire":[47],"image":[48],"optical":[50],"flow":[51],"data,":[52],"capturing":[53],"both":[54,164],"global":[55,88],"contextual":[56],"information":[57],"local":[59],"feature":[60,102,113],"details":[61],"is":[62],"crucial":[63],"for":[64,127],"accurate":[65,144],"pose":[66,147,165],"estimation.":[67,148],"To":[68],"address":[69],"this":[70],"challenge,":[71],"we":[72,99],"propose":[73],"SwinFVO,":[74],"novel":[76],"self-supervised":[77],"framework":[80,126],"incorporates":[82],"enhanced":[83],"perception":[85],"achieve":[87],"spatial":[89,135],"dependency":[90],"modeling":[91],"with":[92],"temporal":[93,140],"continuity.":[94],"Leveraging":[95],"quadrant-based":[96],"characteristics,":[98],"perform":[100],"cross-regional":[101],"interaction":[103],"through":[104],"refined":[106],"Swin":[107,121],"Transformer":[108,122],"architecture.":[109],"Two":[110],"robust":[111],"spatiotemporal":[112],"extractors":[114],"are":[115],"designed":[116],"extend":[118],"single-frame-based":[120],"temporally-aware":[125],"sequential":[128],"understanding.":[129],"Through":[130],"exploration":[132],"of":[133,139,161],"long-range":[134],"correlations":[136],"preservation":[138],"consistency,":[141],"SwinFVO":[142,162],"delivers":[143],"consistent":[146],"Extensive":[149],"experiments":[150],"multiple":[152],"datasets":[153],"demonstrate":[154],"superior":[156],"performance":[157],"generalization":[159],"capability":[160],"depth":[167],"tasks.":[169],"It":[170],"achieves":[171],"competitive":[172],"results":[173],"against":[174],"classical":[175],"algorithms":[176],"outperforms":[178],"related":[179],"state-of-the-art":[180],"(SOTA)":[181],"methods":[182],"by":[183],"up":[184],"20.6%":[186],"72.4%":[188],"on":[189],"average":[190],"translational":[191],"rotational":[193],"evaluations,":[194],"respectively.":[195]},"counts_by_year":[],"updated_date":"2026-04-07T06:01:17.266235","created_date":"2025-11-27T00:00:00"}
