{"id":"https://openalex.org/W7154263945","doi":"https://doi.org/10.48550/arxiv.2604.09920","title":"Does Your VFM Speak Plant? The Botanical Grammar of Vision Foundation Models for Object Detection","display_name":"Does Your VFM Speak Plant? The Botanical Grammar of Vision Foundation Models for Object Detection","publication_year":2026,"publication_date":"2026-04-10","ids":{"openalex":"https://openalex.org/W7154263945","doi":"https://doi.org/10.48550/arxiv.2604.09920"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.09920","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09920","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.09920","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133566521","display_name":"Lars Lundqvist","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lundqvist, Lars","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119248971","display_name":"Earl Ranario","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ranario, Earl","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089316511","display_name":"Hamid Kamangir","orcid":"https://orcid.org/0000-0001-9718-7518"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kamangir, Hamid","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048969131","display_name":"Heesup Yun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun, Heesup","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133608265","display_name":"Christine Diepenbrock","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diepenbrock, Christine","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008008876","display_name":"Brian N. Bailey","orcid":"https://orcid.org/0000-0003-1919-2324"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bailey, Brian N.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133587295","display_name":"J. Mason Earles","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Earles, J. Mason","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10616","display_name":"Smart Agriculture and AI","score":0.4235000014305115,"subfield":{"id":"https://openalex.org/subfields/1110","display_name":"Plant Science"},"field":{"id":"https://openalex.org/fields/11","display_name":"Agricultural and Biological Sciences"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10616","display_name":"Smart Agriculture and AI","score":0.4235000014305115,"subfield":{"id":"https://openalex.org/subfields/1110","display_name":"Plant Science"},"field":{"id":"https://openalex.org/fields/11","display_name":"Agricultural and Biological Sciences"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.32910001277923584,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.013299999758601189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.6679999828338623},{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.5055000185966492},{"id":"https://openalex.org/keywords/grammar","display_name":"Grammar","score":0.4503999948501587},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.43970000743865967},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.43130001425743103},{"id":"https://openalex.org/keywords/detector","display_name":"Detector","score":0.42829999327659607},{"id":"https://openalex.org/keywords/foundation","display_name":"Foundation (evidence)","score":0.38179999589920044}],"concepts":[{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.6679999828338623},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6406999826431274},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.63919997215271},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.5055000185966492},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4740999937057495},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4641000032424927},{"id":"https://openalex.org/C26022165","wikidata":"https://www.wikidata.org/wiki/Q8091","display_name":"Grammar","level":2,"score":0.4503999948501587},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.43970000743865967},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.43130001425743103},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.42829999327659607},{"id":"https://openalex.org/C2780966255","wikidata":"https://www.wikidata.org/wiki/Q5474306","display_name":"Foundation (evidence)","level":2,"score":0.38179999589920044},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.31150001287460327},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2782999873161316},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2766999900341034},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2614000141620636},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.25929999351501465},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.25839999318122864}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.09920","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09920","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.09920","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.09920","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"display_name":"Zero hunger","score":0.5693218111991882,"id":"https://metadata.un.org/sdg/2"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision":[0],"foundation":[1],"models":[2,77],"(VFMs)":[3],"offer":[4],"the":[5,129,147,180,207],"promise":[6],"of":[7,182],"zero-shot":[8,210],"object":[9],"detection":[10,54],"without":[11,215],"task-specific":[12],"training":[13],"data,":[14],"yet":[15],"their":[16],"performance":[17],"in":[18],"complex":[19],"agricultural":[20],"scenes":[21],"remains":[22],"highly":[23],"sensitive":[24],"to":[25,80,127,133,165],"text":[26],"prompt":[27,33,81,156,202],"construction.":[28],"We":[29,61],"present":[30],"a":[31,99,134],"systematic":[32],"optimization":[34],"framework":[35],"evaluating":[36],"four":[37],"open-vocabulary":[38],"detectors":[39,214],"--":[40,48,138,141],"YOLO":[41,107,190],"World,":[42],"SAM3,":[43],"Grounding":[44],"DINO,":[45],"and":[46,52,57,67,109,142,212,219,226],"OWLv2":[47,113],"for":[49,106,112,179,189,196],"cowpea":[50,116,139],"flower":[51,117,153],"pod":[53],"across":[55,228],"synthetic":[56,115,152,161],"real":[58,177],"field":[59],"imagery.":[60],"decompose":[62],"prompts":[63,94,169,222],"into":[64],"eight":[65],"axes":[66],"conduct":[68],"one-factor-at-a-time":[69],"analysis":[70],"followed":[71],"by":[72],"combinatorial":[73,93],"optimization,":[74],"revealing":[75],"that":[76,84,201,220],"respond":[78],"divergently":[79],"structure:":[82],"conditions":[83],"optimize":[85],"one":[86],"architecture":[87],"can":[88,204],"collapse":[89],"another.":[90],"Applying":[91],"model-specific":[92],"yields":[95],"substantial":[96],"gains":[97],"over":[98],"naive":[100],"species-name":[101],"baseline,":[102],"including":[103],"+0.357":[104],"mAP@0.5":[105,111],"World":[108],"+0.362":[110],"on":[114,160,175],"data.":[118,154],"To":[119],"evaluate":[120],"cross-task":[121],"generalization,":[122],"we":[123],"use":[124],"an":[125],"LLM":[126],"translate":[128],"discovered":[130,148,174],"axis":[131],"structure":[132],"morphologically":[135],"distinct":[136],"target":[137],"pods":[140],"compare":[143],"against":[144],"prompting":[145],"using":[146],"optimal":[149,221],"structures":[150,157],"from":[151],"Crucially,":[155],"optimized":[158],"exclusively":[159],"data":[162,178],"transfer":[163],"effectively":[164],"real-world":[166],"fields:":[167],"synthetic-pipeline":[168],"match":[170],"or":[171],"exceed":[172],"those":[173],"labeled":[176],"majority":[181],"model-object":[183],"combinations":[184],"(flower:":[185],"0.374":[186],"vs.":[187,194],"0.353":[188],"World;":[191],"pod:":[192],"0.429":[193],"0.371":[195],"SAM3).":[197],"Our":[198],"findings":[199],"demonstrate":[200],"engineering":[203],"substantially":[205],"close":[206],"gap":[208],"between":[209],"VFMs":[211],"supervised":[213],"requiring":[216],"manual":[217],"annotation,":[218],"are":[223],"model-specific,":[224],"non-obvious,":[225],"transferable":[227],"domains.":[229]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-15T00:00:00"}
