{"id":"https://openalex.org/W7133329154","doi":"https://doi.org/10.48550/arxiv.2603.00136","title":"TinyVLM: Zero-Shot Object Detection on Microcontrollers via Vision-Language Distillation with Matryoshka Embeddings","display_name":"TinyVLM: Zero-Shot Object Detection on Microcontrollers via Vision-Language Distillation with Matryoshka Embeddings","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7133329154","doi":"https://doi.org/10.48550/arxiv.2603.00136"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00136","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00136","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00136","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073061800","display_name":"Bibin Wilson","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wilson, Bibin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5073061800"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6036999821662903,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6036999821662903,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.287200003862381,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0430000014603138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6502000093460083},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.617900013923645},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5264000296592712},{"id":"https://openalex.org/keywords/microcontroller","display_name":"Microcontroller","score":0.5184000134468079},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5059000253677368},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.49810001254081726},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.491100013256073},{"id":"https://openalex.org/keywords/flash","display_name":"Flash (photography)","score":0.3970000147819519},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.39410001039505005}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8432000279426575},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6502000093460083},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.617900013923645},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5321999788284302},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5264000296592712},{"id":"https://openalex.org/C173018170","wikidata":"https://www.wikidata.org/wiki/Q165678","display_name":"Microcontroller","level":2,"score":0.5184000134468079},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5059000253677368},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.49810001254081726},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.491100013256073},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43140000104904175},{"id":"https://openalex.org/C2777526259","wikidata":"https://www.wikidata.org/wiki/Q221836","display_name":"Flash (photography)","level":2,"score":0.3970000147819519},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.39410001039505005},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.35850000381469727},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.35109999775886536},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3504999876022339},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.34549999237060547},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.32589998841285706},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.30799999833106995},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C203479927","wikidata":"https://www.wikidata.org/wiki/Q5165939","display_name":"Controller (irrigation)","level":2,"score":0.30489999055862427},{"id":"https://openalex.org/C50942859","wikidata":"https://www.wikidata.org/wiki/Q4967193","display_name":"Rectification","level":3,"score":0.3010999858379364},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.26589998602867126},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.25360000133514404},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00136","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00136","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00136","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00136","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Zero-shot":[0],"object":[1,47],"detection":[2,48,170],"enables":[3],"recognising":[4],"novel":[5],"objects":[6],"without":[7],"task-specific":[8],"training,":[9],"but":[10],"current":[11],"approaches":[12],"rely":[13],"on":[14,49,117,127,155,161,171],"large":[15],"vision":[16,146],"language":[17],"models":[18],"(VLMs)":[19],"like":[20],"CLIP":[21],"that":[22,68,88,105],"require":[23],"hundreds":[24],"of":[25,27,34,56,136,140],"megabytes":[26],"memory":[28,109,142],"-":[29],"far":[30],"exceeding":[31],"the":[32,42,144,175],"constraints":[33],"micro":[35],"controller":[36],"units":[37],"(MCUs).":[38],"We":[39,148],"present":[40],"TinyVLM,":[41],"first":[43,176],"framework":[44],"enabling":[45,96,167],"zero-shot":[46,125,169],"resource-constrained":[50],"MCUs":[51],"with":[52,112,163],"less":[53],"than":[54],"1MB":[55],"memory.":[57],"Our":[58],"approach":[59],"introduces":[60],"three":[61],"key":[62],"innovations:":[63],"(1)":[64],"a":[65],"decoupled":[66],"architecture":[67],"separates":[69],"visual":[70],"inference":[71,151],"from":[72],"text":[73],"encoding,":[74],"allowing":[75],"precomputed":[76],"class":[77,107],"embeddings":[78,91],"to":[79],"be":[80],"stored":[81],"in":[82],"flash":[83,141],"memory;":[84],"(2)":[85],"Matryoshka":[86],"distillation":[87],"trains":[89],"nested":[90],"at":[92,152],"multiple":[93],"dimensions":[94],"(16-256),":[95],"flexible":[97],"accuracy-memory":[98],"trade-offs;":[99],"and":[100,130,138,157],"(3)":[101],"quantized":[102],"embedding":[103],"storage":[104],"reduces":[106],"prototype":[108],"by":[110],"4x":[111],"minimal":[113],"accuracy":[114,126],"loss.":[115],"Trained":[116],"Conceptual":[118],"Captions":[119],"3M":[120],"(CC3M),":[121],"TinyVLM":[122],"achieves":[123],"competitive":[124],"COCO,":[128],"Flowers102,":[129],"Food101":[131],"while":[132],"requiring":[133],"only":[134],"285KB":[135],"RAM":[137],"892KB":[139],"for":[143,174],"deployed":[145],"encoder.":[147],"demonstrate":[149],"real-time":[150],"26":[153],"FPS":[154,160],"STM32H7":[156],"over":[158],"1,000":[159],"MAX78000":[162],"its":[164],"CNN":[165],"accelerator,":[166],"practical":[168],"edge":[172],"devices":[173],"time.":[177]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
