{"id":"https://openalex.org/W7164806335","doi":"https://doi.org/10.1145/3805622.3810664","title":"DINOTrack: Leverage Differential Attention for Noise-Aware Visual Tracking with DINOv3","display_name":"DINOTrack: Leverage Differential Attention for Noise-Aware Visual Tracking with DINOv3","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164806335","doi":"https://doi.org/10.1145/3805622.3810664"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810664","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810664","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810664","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013867024","display_name":"Wei Xu","orcid":"https://orcid.org/0000-0001-9341-8382"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Xu","raw_affiliation_strings":["Guangzhou Institute of Technology, Xidian University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0004-3865-6168","affiliations":[{"raw_affiliation_string":"Guangzhou Institute of Technology, Xidian University, Guangzhou, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108888999","display_name":"Gu Geng","orcid":"https://orcid.org/0009-0000-9834-524X"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gu Geng","raw_affiliation_strings":["Guangzhou Institute of Technology, Xidian University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0000-9834-524X","affiliations":[{"raw_affiliation_string":"Guangzhou Institute of Technology, Xidian University, Guangzhou, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019455734","display_name":"Di Yuan","orcid":"https://orcid.org/0000-0001-9403-1112"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Di Yuan","raw_affiliation_strings":["Guangzhou Institute of Technology, Xidian University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-9403-1112","affiliations":[{"raw_affiliation_string":"Guangzhou Institute of Technology, Xidian University, Guangzhou, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100403637","display_name":"Rui Chen","orcid":"https://orcid.org/0000-0002-3690-7902"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Chen","raw_affiliation_strings":["Guangzhou Institute of Technology, Xidian University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-3690-7902","affiliations":[{"raw_affiliation_string":"Guangzhou Institute of Technology, Xidian University, Guangzhou, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100393714","display_name":"Qiao Liu","orcid":"https://orcid.org/0000-0003-0885-7976"},"institutions":[{"id":"https://openalex.org/I126924076","display_name":"Chongqing Normal University","ror":"https://ror.org/01dcw5w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I126924076"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiao Liu","raw_affiliation_strings":["National Center for Applied Mathematics in Chongqing, Chongqing Normal University, Chongqing, China"],"raw_orcid":"https://orcid.org/0000-0003-0885-7976","affiliations":[{"raw_affiliation_string":"National Center for Applied Mathematics in Chongqing, Chongqing Normal University, Chongqing, China","institution_ids":["https://openalex.org/I126924076"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93372047,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1450","last_page":"1458"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9217000007629395,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9217000007629395,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11707","display_name":"Gaze Tracking and Assistive Technology","score":0.027799999341368675,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.004100000020116568,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.8202000260353088},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.6151000261306763},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5710999965667725},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4715999960899353},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4668000042438507},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.42899999022483826},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.41110000014305115},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.38280001282691956},{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.36469998955726624}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.8202000260353088},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7483999729156494},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6509000062942505},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.6151000261306763},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5710999965667725},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4715999960899353},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4668000042438507},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44530001282691956},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.42899999022483826},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.41110000014305115},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.38280001282691956},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.36469998955726624},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.35269999504089355},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.3212999999523163},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2903999984264374},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2897999882698059},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2777999937534332},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.25999999046325684},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.25859999656677246},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810664","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810664","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810664","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810664","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.7775090932846069,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1490237754","https://openalex.org/W1857884451","https://openalex.org/W1861492603","https://openalex.org/W2470394683","https://openalex.org/W2557641257","https://openalex.org/W2794744029","https://openalex.org/W2799058067","https://openalex.org/W2891033863","https://openalex.org/W2898200825","https://openalex.org/W2962766617","https://openalex.org/W2963351448","https://openalex.org/W2963534981","https://openalex.org/W3001584168","https://openalex.org/W3010738020","https://openalex.org/W3035524453","https://openalex.org/W3090155371","https://openalex.org/W3108519869","https://openalex.org/W3109466111","https://openalex.org/W3158472981","https://openalex.org/W3159481202","https://openalex.org/W3167536469","https://openalex.org/W3181069167","https://openalex.org/W3204540098","https://openalex.org/W3204554907","https://openalex.org/W3214586131","https://openalex.org/W4214759957","https://openalex.org/W4312323989","https://openalex.org/W4312532041","https://openalex.org/W4312651322","https://openalex.org/W4312668764","https://openalex.org/W4312751983","https://openalex.org/W4313156423","https://openalex.org/W4376133601","https://openalex.org/W4390577959","https://openalex.org/W4400943531","https://openalex.org/W4409019071"],"related_works":[],"abstract_inverted_index":{"Despite":[0],"significant":[1,199],"progress":[2],"in":[3],"Transformer-based":[4],"visual":[5],"tracking,":[6],"standard":[7],"self-attention":[8],"mechanisms":[9],"inherently":[10],"lack":[11],"the":[12,24,29,44,57,106,115,136,177],"discriminative":[13,92],"power":[14],"required":[15],"for":[16,113,164],"accurate":[17],"template":[18],"search":[19],"and":[20,64,150,210],"matching.":[21,192],"Due":[22],"to":[23,35,43,59,67,83,120],"global":[25],"normalization":[26],"of":[27],"softmax,":[28],"model":[30,107],"inevitably":[31],"assigns":[32],"probabilistic":[33],"quality":[34],"background":[36,86,148],"noise":[37,122,166,184],"terms":[38],"that":[39,196],"are":[40],"semantically":[41],"similar":[42],"target.":[45],"This":[46],"\"attention":[47],"noise\"":[48],"not":[49],"only":[50],"reduces":[51],"tracking":[52,80],"accuracy":[53],"but":[54],"also":[55],"forces":[56],"network":[58],"require":[60],"substantial":[61],"training":[62],"data":[63],"computational":[65],"costs":[66],"learn":[68],"robust":[69,109],"feature":[70,103,124],"suppression.":[71],"To":[72],"address":[73],"this,":[74],"we":[75,89,126,156],"propose":[76],"DINOTrack,":[77],"a":[78,91,96,128,141,158,170],"noise-aware":[79],"framework":[81],"designed":[82],"systematically":[84],"eliminate":[85],"clutter.":[87],"First,":[88],"construct":[90],"representation":[93],"foundation":[94],"using":[95],"frozen":[97],"DINOv3":[98],"backbone":[99],"with":[100],"hierarchical":[101],"multi-level":[102],"fusion,":[104],"ensuring":[105],"captures":[108],"semantic":[110,142,187],"features":[111],"essential":[112],"distinguishing":[114],"target":[116,153,191],"from":[117],"distractors.":[118],"Second,":[119],"mitigate":[121],"before":[123],"interaction,":[125],"design":[127],"Class-Guided":[129],"Heatmap":[130],"Modulation":[131],"(CGHM)":[132],"module.":[133],"By":[134,168],"utilizing":[135],"template\u2019s":[137],"class":[138],"label":[139],"as":[140,205],"prior,":[143],"this":[144],"module":[145,179],"explicitly":[146],"suppresses":[147],"interference":[149],"enhances":[151],"potential":[152],"responses.":[154],"Crucially,":[155],"introduce":[157],"Differential":[159],"Denoising":[160],"Interaction":[161],"(DDI)":[162],"mechanism":[163],"active":[165],"cancellation.":[167],"performing":[169],"difference":[171],"operation":[172],"on":[173,202],"dual":[174],"query-key":[175],"pairs,":[176],"DDI":[178],"effectively":[180],"eliminates":[181],"shared":[182],"attention":[183],"caused":[185],"by":[186],"aliasing,":[188],"enabling":[189],"precise":[190],"Extensive":[193],"experiments":[194],"demonstrate":[195],"DINOTrack":[197],"achieves":[198],"performance":[200],"improvements":[201],"datasets":[203],"such":[204],"LaSOT,":[206],"GOT-10k,":[207],"TNL2K,":[208],"LaSOText,":[209],"TrackingNet.":[211]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
