{"id":"https://openalex.org/W7128519180","doi":"https://doi.org/10.48550/arxiv.2602.08626","title":"Revisiting [CLS] and Patch Token Interaction in Vision Transformers","display_name":"Revisiting [CLS] and Patch Token Interaction in Vision Transformers","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7128519180","doi":"https://doi.org/10.48550/arxiv.2602.08626"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.08626","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114337728","display_name":"Alexis Marouani","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Marouani, Alexis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082234499","display_name":"Oriane Sim\u00e9oni","orcid":"https://orcid.org/0000-0003-3232-8978"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sim\u00e9oni, Oriane","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125487360","display_name":"Herv\u00e9 J\u00e9gou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"J\u00e9gou, Herv\u00e9","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Bojanowski, Piotr","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bojanowski, Piotr","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5109581894","display_name":"Huy V. Vo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vo, Huy V.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5114337728"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6065000295639038,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6065000295639038,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.17159999907016754,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.05400000140070915,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.6348999738693237},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6134999990463257},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.49160000681877136},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.4578999876976013},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4278999865055084},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4059999883174896},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.3953999876976013}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6798999905586243},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.6348999738693237},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6134999990463257},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5227000117301941},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.49160000681877136},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.4578999876976013},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4278999865055084},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4059999883174896},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3953999876976013},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3555000126361847},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3352999985218048},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.31779998540878296},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2985000014305115},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C141513077","wikidata":"https://www.wikidata.org/wiki/Q378542","display_name":"Independent and identically distributed random variables","level":3,"score":0.2538999915122986}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.08626","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.08626","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.08626","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.08626","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.727422297000885}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision":[0],"Transformers":[1],"have":[2],"emerged":[3],"as":[4],"powerful,":[5],"scalable":[6],"and":[7,15,55,69,104,111,177,185],"versatile":[8],"representation":[9,123],"learners.":[10],"To":[11],"capture":[12],"both":[13,37],"global":[14,54],"local":[16,56],"features,":[17],"a":[18],"learnable":[19],"[CLS]":[20],"class":[21,68,103],"token":[22,38,85],"is":[23],"typically":[24],"prepended":[25],"to":[26,119],"the":[27,44,51,65,99],"input":[28],"sequence":[29],"of":[30,102,135],"patch":[31,70,105,122],"tokens.":[32,71],"Despite":[33],"their":[34],"distinct":[35],"nature,":[36],"types":[39],"are":[40],"processed":[41],"identically":[42],"throughout":[43],"model.":[45],"In":[46],"this":[47,89],"work,":[48],"we":[49,91,166],"investigate":[50],"friction":[52],"between":[53,67,83],"feature":[57],"learning":[58,186],"under":[59],"different":[60],"pre-training":[61],"strategies":[62],"by":[63],"analyzing":[64],"interactions":[66],"Our":[72,129],"analysis":[73],"reveals":[74],"that":[75,96],"standard":[76,141],"normalization":[77,109],"layers":[78,110],"introduce":[79,151],"an":[80,153],"implicit":[81],"differentiation":[82],"these":[84],"types.":[86],"Building":[87],"on":[88,140],"insight,":[90],"propose":[92],"specialized":[93],"processing":[94],"paths":[95],"selectively":[97],"disentangle":[98],"computational":[100,161],"flow":[101],"tokens,":[106],"particularly":[107],"within":[108],"early":[112],"query-key-value":[113],"projections.":[114],"This":[115],"targeted":[116],"specialization":[117,176],"leads":[118],"significantly":[120],"improved":[121],"quality":[124],"for":[125],"dense":[126],"prediction":[127],"tasks.":[128],"experiments":[130],"demonstrate":[131],"segmentation":[132],"performance":[133],"gains":[134],"over":[136],"2":[137],"mIoU":[138],"points":[139],"benchmarks,":[142],"while":[143],"maintaining":[144],"strong":[145],"classification":[146],"accuracy.":[147],"The":[148],"proposed":[149],"modifications":[150],"only":[152],"8%":[154],"increase":[155],"in":[156],"parameters,":[157],"with":[158],"no":[159],"additional":[160],"overhead.":[162],"Through":[163],"comprehensive":[164],"ablations,":[165],"provide":[167],"insights":[168],"into":[169],"which":[170],"architectural":[171],"components":[172],"benefit":[173],"most":[174],"from":[175],"how":[178],"our":[179],"approach":[180],"generalizes":[181],"across":[182],"model":[183],"scales":[184],"frameworks.":[187]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-11T00:00:00"}
