{"id":"https://openalex.org/W7151739330","doi":"https://doi.org/10.48550/arxiv.2604.06124","title":"Lightweight Multimodal Adaptation of Vision Language Models for Species Recognition and Habitat Context Interpretation in Drone Thermal Imagery","display_name":"Lightweight Multimodal Adaptation of Vision Language Models for Species Recognition and Habitat Context Interpretation in Drone Thermal Imagery","publication_year":2026,"publication_date":"2026-04-07","ids":{"openalex":"https://openalex.org/W7151739330","doi":"https://doi.org/10.48550/arxiv.2604.06124"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06124","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06124","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06124","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133176483","display_name":"Hao Chen","orcid":"https://orcid.org/0009-0003-0975-6197"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133215733","display_name":"Fang Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Fang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094162574","display_name":"Fangchao Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Fangchao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108679454","display_name":"Defei Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Defei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011726734","display_name":"Eve Bohnett","orcid":"https://orcid.org/0000-0002-1870-8897"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bohnett, Eve","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133175309","display_name":"Li An","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"An, Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11133","display_name":"UAV Applications and Optimization","score":0.17810000479221344,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11133","display_name":"UAV Applications and Optimization","score":0.17810000479221344,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11211","display_name":"3D Surveying and Cultural Heritage","score":0.13779999315738678,"subfield":{"id":"https://openalex.org/subfields/1907","display_name":"Geology"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10199","display_name":"Wildlife Ecology and Conservation","score":0.09269999712705612,"subfield":{"id":"https://openalex.org/subfields/2303","display_name":"Ecology"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/drone","display_name":"Drone","score":0.6858000159263611},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6342999935150146},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5522000193595886},{"id":"https://openalex.org/keywords/adaptation","display_name":"Adaptation (eye)","score":0.5475999712944031},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.5001000165939331},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.40529999136924744},{"id":"https://openalex.org/keywords/thermal-infrared","display_name":"Thermal infrared","score":0.3912000060081482},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3887999951839447},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.3862999975681305}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6956999897956848},{"id":"https://openalex.org/C59519942","wikidata":"https://www.wikidata.org/wiki/Q650665","display_name":"Drone","level":2,"score":0.6858000159263611},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6342999935150146},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.597000002861023},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5522000193595886},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.5475999712944031},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.5001000165939331},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.44620001316070557},{"id":"https://openalex.org/C62649853","wikidata":"https://www.wikidata.org/wiki/Q199687","display_name":"Remote sensing","level":1,"score":0.4146000146865845},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.40529999136924744},{"id":"https://openalex.org/C2984335091","wikidata":"https://www.wikidata.org/wiki/Q11388","display_name":"Thermal infrared","level":3,"score":0.3912000060081482},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3887999951839447},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.3862999975681305},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.3813000023365021},{"id":"https://openalex.org/C39399123","wikidata":"https://www.wikidata.org/wiki/Q1348989","display_name":"Earth observation","level":3,"score":0.37959998846054077},{"id":"https://openalex.org/C2778102629","wikidata":"https://www.wikidata.org/wiki/Q725252","display_name":"Satellite imagery","level":2,"score":0.3741999864578247},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35690000653266907},{"id":"https://openalex.org/C2776865275","wikidata":"https://www.wikidata.org/wiki/Q311666","display_name":"Projector","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.3463999927043915},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.31279999017715454},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.3107999861240387},{"id":"https://openalex.org/C4051589","wikidata":"https://www.wikidata.org/wiki/Q860959","display_name":"Mental image","level":3,"score":0.2953999936580658},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2599000036716461},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.25859999656677246},{"id":"https://openalex.org/C161840515","wikidata":"https://www.wikidata.org/wiki/Q186131","display_name":"Terrain","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2551000118255615},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.25189998745918274},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25119999051094055},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.25049999356269836},{"id":"https://openalex.org/C2987819851","wikidata":"https://www.wikidata.org/wiki/Q191839","display_name":"Aerial imagery","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06124","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06124","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06124","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06124","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/15","score":0.7129525542259216,"display_name":"Life in Land"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"study":[1],"proposes":[2],"a":[3,26],"lightweight":[4,153],"multimodal":[5,45],"adaptation":[6,155],"framework":[7],"to":[8,41,57,134,166,176],"bridge":[9],"the":[10,49,85,93,132,149],"representation":[11],"gap":[12],"between":[13],"RGB-pretrained":[14,164],"VLMs":[15,43,165],"and":[16,20,38,67,74,81,107,111,118,144,159],"thermal":[17,31,58,124,167],"infrared":[18],"imagery,":[19,169],"demonstrates":[21],"its":[22],"practical":[23,160],"utility":[24,172],"using":[25],"real":[27],"drone-collected":[28,36],"dataset.":[29],"A":[30],"dataset":[32],"was":[33,39],"developed":[34],"from":[35,53,173],"imagery":[37,125,130],"used":[40],"fine-tune":[42],"through":[44],"projector":[46],"alignment,":[47],"enabling":[48],"transfer":[50],"of":[51,100,115],"information":[52],"RGB-based":[54],"visual":[55],"representations":[56],"radiometric":[59],"inputs.":[60],"Three":[61],"representative":[62],"models,":[63,87],"including":[64,138],"InternVL3-8B-Instruct,":[65],"Qwen2.5-VL-7B-Instruct,":[66],"Qwen3-VL-8B-Instruct,":[68],"were":[69],"benchmarked":[70],"under":[71],"both":[72],"closed-set":[73],"open-set":[75,90],"prompting":[76,91],"conditions":[77],"for":[78,102,105,109,162],"species":[79],"recognition":[80,175],"instance":[82],"enumeration.":[83],"Among":[84],"tested":[86],"Qwen3-VL-8B-Instruct":[88],"with":[89,97,126],"achieved":[92],"best":[94],"overall":[95],"performance,":[96],"F1":[98],"scores":[99],"0.935":[101],"deer,":[103],"0.915":[104],"rhino,":[106],"0.968":[108],"elephant,":[110],"within-1":[112],"enumeration":[113],"accuracies":[114],"0.779,":[116],"0.982,":[117],"1.000,":[119],"respectively.":[120],"In":[121],"addition,":[122],"combining":[123],"simultaneously":[127],"collected":[128],"RGB":[129],"enabled":[131],"model":[133],"generate":[135],"habitat-context":[136,177],"information,":[137],"land-cover":[139],"characteristics,":[140],"key":[141],"landscape":[142],"features,":[143],"visible":[145],"human":[146],"disturbance.":[147],"Overall,":[148],"findings":[150],"demonstrate":[151],"that":[152],"projector-based":[154],"provides":[156],"an":[157],"effective":[158],"route":[161],"transferring":[163],"drone":[168],"expanding":[170],"their":[171],"object-level":[174],"interpretation":[178],"in":[179],"ecological":[180],"monitoring.":[181]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-09T00:00:00"}
