{"id":"https://openalex.org/W7154707461","doi":"https://doi.org/10.48550/arxiv.2604.14629","title":"Switch-KD: Visual-Switch Knowledge Distillation for Vision-Language Models","display_name":"Switch-KD: Visual-Switch Knowledge Distillation for Vision-Language Models","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7154707461","doi":"https://doi.org/10.48550/arxiv.2604.14629"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.14629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.14629","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106075816","display_name":"Haoyi Sun","orcid":"https://orcid.org/0009-0005-6471-1826"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Haoyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133879170","display_name":"Xiaoxiao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiaoxiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133876759","display_name":"Ning Mao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mao, Ning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133849517","display_name":"Qian Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016496701","display_name":"Lifu Mu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mu, Lifu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133897520","display_name":"Wen Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133904457","display_name":"Tao Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133834468","display_name":"Wei Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Wei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5106075816"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9677000045776367,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9677000045776367,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.005799999926239252,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.004699999932199717,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6462000012397766},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.64410001039505},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.5623000264167786},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4875999987125397},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.4657999873161316},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.40779998898506165},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.39010000228881836},{"id":"https://openalex.org/keywords/knowledge-transfer","display_name":"Knowledge transfer","score":0.3750999867916107}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7513999938964844},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6462000012397766},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.64410001039505},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.5623000264167786},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5496000051498413},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4875999987125397},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.4657999873161316},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4361000061035156},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.40779998898506165},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C2776960227","wikidata":"https://www.wikidata.org/wiki/Q2586354","display_name":"Knowledge transfer","level":2,"score":0.3750999867916107},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3447999954223633},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.30959999561309814},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.2992999851703644},{"id":"https://openalex.org/C84685590","wikidata":"https://www.wikidata.org/wiki/Q1540472","display_name":"Knowledge engineering","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C56814567","wikidata":"https://www.wikidata.org/wiki/Q1323686","display_name":"Explicit knowledge","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2554999887943268}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.14629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.14629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"have":[3],"shown":[4],"remarkable":[5],"capabilities":[6,33],"in":[7,20,58],"joint":[8],"vision-language":[9,95],"understanding,":[10],"but":[11],"their":[12],"large":[13],"scale":[14],"poses":[15],"significant":[16],"challenges":[17],"for":[18,127],"deployment":[19,42],"resource-constrained":[21],"scenarios.":[22],"Knowledge":[23],"Distillation":[24],"(KD)":[25],"offers":[26],"a":[27,89,99,161],"viable":[28],"way":[29],"to":[30,48,78,122],"improve":[31],"model":[32,36],"without":[34,72,184],"increasing":[35],"size":[37],"or":[38],"data":[39],"requirements,":[40],"making":[41],"more":[43],"efficient.":[44],"However,":[45],"applying":[46],"KD":[47],"VLMs":[49,59],"is":[50,60],"challenged":[51],"by":[52,159],"modality-specific":[53],"supervision:":[54],"although":[55],"multimodal":[56,75,80,167,182],"knowledge":[57,81,96,130,168],"fused":[61],"within":[62,98],"the":[63,113,118,148],"language":[64,120],"space,":[65],"current":[66],"methods":[67],"supervise":[68],"each":[69],"modality":[70],"separately":[71],"explicitly":[73],"addressing":[74],"alignment,":[76],"leading":[77],"inconsistent":[79],"transfer.":[82],"To":[83],"address":[84],"this,":[85],"we":[86],"propose":[87],"Switch-KD,":[88,160],"visual-switch":[90],"distillation":[91],"framework":[92],"that":[93],"unifies":[94],"transfer":[97],"shared":[100],"text-probability":[101],"space.":[102],"Switch-KD":[103],"comprises":[104],"two":[105],"key":[106],"components:":[107],"(1)":[108],"Visual-Switch":[109],"Distillation,":[110],"which":[111,140],"switches":[112],"student's":[114],"visual":[115,129],"outputs":[116],"into":[117],"teacher's":[119],"pathway":[121],"construct":[123],"cross-modal":[124],"probabilistic":[125],"references":[126],"implicit":[128],"transfer;":[131],"and":[132,153],"(2)":[133],"Dynamic":[134],"Bi-directional":[135],"Logits":[136],"Difference":[137],"(DBiLD)":[138],"loss,":[139],"adaptively":[141],"aligns":[142],"informative":[143],"probability":[144],"regions":[145],"while":[146],"preserving":[147],"distributional":[149],"structures":[150],"of":[151,177],"teacher":[152],"student":[154],"through":[155],"bidirectional":[156],"supervision.":[157],"Guided":[158],"0.5B":[162],"TinyLLaVA":[163],"effectively":[164],"distills":[165],"rich":[166],"from":[169],"its":[170],"3B":[171],"teacher,":[172],"yielding":[173],"an":[174],"average":[175],"improvement":[176],"3.6":[178],"points":[179],"across":[180],"10":[181],"benchmarks":[183],"any":[185],"architectural":[186],"modification.":[187]},"counts_by_year":[],"updated_date":"2026-04-18T06:05:20.339008","created_date":"2026-04-18T00:00:00"}
