{"id":"https://openalex.org/W4416366572","doi":"https://doi.org/10.1109/access.2025.3634778","title":"SSF4VSU: A Self-Supervised Synergetic Framework for Visual Scene Understanding","display_name":"SSF4VSU: A Self-Supervised Synergetic Framework for Visual Scene Understanding","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416366572","doi":"https://doi.org/10.1109/access.2025.3634778"},"language":"en","primary_location":{"id":"doi:10.1109/access.2025.3634778","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3634778","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2025.3634778","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043426188","display_name":"Saif Hassan","orcid":"https://orcid.org/0000-0001-9040-3177"},"institutions":[{"id":"https://openalex.org/I68288478","display_name":"Sukkur IBA University","ror":"https://ror.org/03e5jvk98","country_code":"PK","type":"education","lineage":["https://openalex.org/I68288478"]}],"countries":["PK"],"is_corresponding":true,"raw_author_name":"Saif Hassan","raw_affiliation_strings":["Department of Computer Science, Sukkur IBA University, Sukkur, Pakistan"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Sukkur IBA University, Sukkur, Pakistan","institution_ids":["https://openalex.org/I68288478"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101607094","display_name":"Ghulam Mujtaba","orcid":"https://orcid.org/0000-0002-1563-1142"},"institutions":[{"id":"https://openalex.org/I68288478","display_name":"Sukkur IBA University","ror":"https://ror.org/03e5jvk98","country_code":"PK","type":"education","lineage":["https://openalex.org/I68288478"]}],"countries":["PK"],"is_corresponding":false,"raw_author_name":"Ghulam Mujtaba","raw_affiliation_strings":["Department of Computer Science, Sukkur IBA University, Sukkur, Pakistan"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Sukkur IBA University, Sukkur, Pakistan","institution_ids":["https://openalex.org/I68288478"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044793388","display_name":"Habib Ullah","orcid":"https://orcid.org/0000-0002-2434-0849"},"institutions":[{"id":"https://openalex.org/I54108979","display_name":"Norwegian University of Life Sciences","ror":"https://ror.org/04a1mvv97","country_code":"NO","type":"education","lineage":["https://openalex.org/I54108979"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Habib Ullah","raw_affiliation_strings":["Faculty of Science and Technology (REALTEK), Norwegian University of Life Sciences (NMBU), &#x00C5;s, Norway"],"affiliations":[{"raw_affiliation_string":"Faculty of Science and Technology (REALTEK), Norwegian University of Life Sciences (NMBU), &#x00C5;s, Norway","institution_ids":["https://openalex.org/I54108979"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044015235","display_name":"Ali Shariq Imran","orcid":"https://orcid.org/0000-0002-2416-2878"},"institutions":[{"id":"https://openalex.org/I204778367","display_name":"Norwegian University of Science and Technology","ror":"https://ror.org/05xg72x27","country_code":"NO","type":"education","lineage":["https://openalex.org/I204778367"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Ali Shariq Imran","raw_affiliation_strings":["Department of Computer Science (IDI), Intelligent Systems and Analytics (ISA) Research Group, Norwegian University of Science and Technology (NTNU), Gj&#x00D8;vik, Norway","Norwegian University of Science &#x0026; Technology (NTNU), Gj&#x00F8;vik, Norway"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science (IDI), Intelligent Systems and Analytics (ISA) Research Group, Norwegian University of Science and Technology (NTNU), Gj&#x00D8;vik, Norway","institution_ids":["https://openalex.org/I204778367"]},{"raw_affiliation_string":"Norwegian University of Science &#x0026; Technology (NTNU), Gj&#x00F8;vik, Norway","institution_ids":["https://openalex.org/I204778367"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012429784","display_name":"Ahmet Soylu","orcid":"https://orcid.org/0000-0001-6034-4137"},"institutions":[{"id":"https://openalex.org/I2800207870","display_name":"H\u00f8yskolen Kristiania","ror":"https://ror.org/03gss5916","country_code":"NO","type":"education","lineage":["https://openalex.org/I2800207870"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Ahmet Soylu","raw_affiliation_strings":["School of Economics, Innovation, and Technology (SEIT), Kristiania University of Applied Sciences, Oslo, Norway"],"affiliations":[{"raw_affiliation_string":"School of Economics, Innovation, and Technology (SEIT), Kristiania University of Applied Sciences, Oslo, Norway","institution_ids":["https://openalex.org/I2800207870"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062997767","display_name":"Mohib Ullah","orcid":"https://orcid.org/0000-0002-0222-6340"},"institutions":[{"id":"https://openalex.org/I204778367","display_name":"Norwegian University of Science and Technology","ror":"https://ror.org/05xg72x27","country_code":"NO","type":"education","lineage":["https://openalex.org/I204778367"]}],"countries":["NO"],"is_corresponding":false,"raw_author_name":"Mohib Ullah","raw_affiliation_strings":["Department of Computer Science (IDI), Intelligent Systems and Analytics (ISA) Research Group, Norwegian University of Science and Technology (NTNU), Gj&#x00D8;vik, Norway","Norwegian University of Science &#x0026; Technology (NTNU), Gj&#x00F8;vik, Norway"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science (IDI), Intelligent Systems and Analytics (ISA) Research Group, Norwegian University of Science and Technology (NTNU), Gj&#x00D8;vik, Norway","institution_ids":["https://openalex.org/I204778367"]},{"raw_affiliation_string":"Norwegian University of Science &#x0026; Technology (NTNU), Gj&#x00F8;vik, Norway","institution_ids":["https://openalex.org/I204778367"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5043426188"],"corresponding_institution_ids":["https://openalex.org/I68288478"],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38164881,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"13","issue":null,"first_page":"197544","last_page":"197561"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9079999923706055,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9079999923706055,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11707","display_name":"Gaze Tracking and Assistive Technology","score":0.01269999984651804,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.009999999776482582,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.685699999332428},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.508400022983551},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4832000136375427},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.47589999437332153},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.43059998750686646},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.4113999903202057},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.38519999384880066},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.3671000003814697}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8738999962806702},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.685699999332428},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6599000096321106},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.508400022983551},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49129998683929443},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4832000136375427},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.47589999437332153},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.43059998750686646},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.4113999903202057},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.38519999384880066},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3671000003814697},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.3490999937057495},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.3124000132083893},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.31119999289512634},{"id":"https://openalex.org/C2775936607","wikidata":"https://www.wikidata.org/wiki/Q466845","display_name":"Tracking (education)","level":2,"score":0.3028999865055084},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28349998593330383},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2822999954223633},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2648000121116638},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.25690001249313354},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/access.2025.3634778","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3634778","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:db918a7456064c139958adc4437aa0d9","is_oa":true,"landing_page_url":"https://doaj.org/article/db918a7456064c139958adc4437aa0d9","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Access, Vol 13, Pp 197544-197561 (2025)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1109/access.2025.3634778","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3634778","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Visual":[0],"scene":[1,295],"understanding":[2,296],"involves":[3],"multiple":[4],"tasks,":[5,77],"including":[6],"single-object":[7],"tracking":[8,11,268],"(SOT),":[9],"multi-object":[10],"(MOT),":[12],"video":[13,110,304],"object":[14],"segmentation":[15,215,270],"(VOS),":[16],"and":[17,21,46,63,88,101,126,140,152,157,160,177,206,217,230,252,269,286,297],"its":[18],"Multi-Object":[19],"Tracking":[20],"Segmentation":[22],"(MOTS)":[23],"variant,":[24],"which":[25],"have":[26],"traditionally":[27],"been":[28],"studied":[29],"in":[30],"isolation.":[31],"Such":[32],"fragmentation":[33],"leads":[34],"to":[35,83,96,189],"task-specific":[36,193],"architectures":[37],"that":[38,59,107,136,239,260],"must":[39],"be":[40],"retrained":[41],"for":[42,75,112,243,249,301],"each":[43],"new":[44],"scenario":[45],"rely":[47],"heavily":[48],"on":[49,155,163,180,185,204,211,228,233,280],"annotated":[50],"data.":[51],"Thiswork":[52],"proposes":[53],"a":[54,67,71,78,91,102,117,261],"synergetic":[55],"model":[56,264],"called":[57],"SSF4VSU":[58,65,137,198,289],"simultaneously":[60],"addresses":[61],"SOT,MOT,VOS,":[62],"MOTS.":[64,235],"employs":[66],"shared":[68],"backbone":[69],"with":[70,120],"unified":[72,141,218,263],"embedding":[73],"space":[74],"different":[76],"Temporal":[79,92],"Attention":[80],"Module":[81,94],"(TAM)":[82],"align":[84],"features":[85],"across":[86,131],"frames":[87],"resist":[89],"occlusions,":[90],"Consistency":[93],"(TCM)":[95],"enforce":[97],"smooth":[98],"identity":[99,250],"preservation,":[100],"self-supervised":[103],"learning":[104],"(SSL)":[105],"branch":[106],"leverages":[108],"unlabeled":[109],"sequences":[111],"improved":[113],"generalization.":[114,256],"Training":[115],"follows":[116],"multi-task":[118,303],"curriculum":[119],"dynamic":[121],"loss":[122],"balancing,":[123],"demonstrating":[124],"efficiency":[125],"scalability.":[127],"A":[128],"comprehensive":[129],"evaluation":[130],"six":[132],"public":[133],"benchmarks":[134,146],"shows":[135],"surpasses":[138],"specialised":[139,169],"state-of-the-art":[142],"models.":[143],"On":[144,195],"SOT":[145],"it":[147,173,187,224],"achieves":[148,225],"74.7%":[149],"success":[150],"AUC":[151,159],"80.4%":[153],"precision":[154,162],"LaSOT,":[156],"85.8%":[158],"84.3%":[161],"TrackingNet,":[164],"matching":[165],"or":[166,275],"exceeding":[167],"recent":[168],"trackers.":[170],"For":[171],"MOT,":[172],"records":[174],"82.9%":[175],"MOTA":[176],"83.3%":[178],"IDF1":[179],"the":[181,221,291,299],"MOT17":[182],"dataset,":[183],"while":[184],"BDD100K":[186,234],"generalises":[188],"driving":[190],"scenes":[191],"without":[192,272],"tuning.":[194],"VOS":[196],"benchmarks,":[197],"delivers":[199],"93.3%":[200],"<italic":[201,208],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[202,209],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">J&amp;F</i>":[203,210],"DAVIS-2016":[205],"89.0%":[207],"DAVIS-2017,":[212],"surpassing":[213],"strong":[214],"methods":[216],"baselines.":[219],"In":[220],"MOTS":[222],"setting":[223],"69.0%":[226],"sMOTSA":[227],"MOTS20":[229],"31.2%":[231],"mMOTSA":[232],"Ablation":[236],"studies":[237],"reveal":[238],"TAM":[240],"is":[241,247],"critical":[242],"accurate":[244],"localisation,":[245],"TCM":[246],"indispensable":[248],"stability,":[251],"SSL":[253],"consistently":[254],"improves":[255],"The":[257],"results":[258],"demonstrate":[259],"single":[262],"can":[265],"perform":[266],"diverse":[267],"tasks":[271],"sacrificing":[273],"accuracy":[274],"efficiency.":[276],"By":[277],"reducing":[278],"reliance":[279],"labelled":[281],"data,":[282],"improving":[283],"temporal":[284],"reasoning,":[285],"ensuring":[287],"interpretability,":[288],"advances":[290],"state":[292],"of":[293],"visual":[294],"lays":[298],"foundation":[300],"general-purpose,":[302],"analysis.":[305]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-19T00:00:00"}
