{"id":"https://openalex.org/W4417252141","doi":"https://doi.org/10.1109/tmm.2025.3642913","title":"Seeing With Words: Interpretable Language-Guided Drone Geo-Localization via LLM-Enriched Semantic Attribute Alignment","display_name":"Seeing With Words: Interpretable Language-Guided Drone Geo-Localization via LLM-Enriched Semantic Attribute Alignment","publication_year":2025,"publication_date":"2025-12-11","ids":{"openalex":"https://openalex.org/W4417252141","doi":"https://doi.org/10.1109/tmm.2025.3642913"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2025.3642913","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3642913","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076922285","display_name":"Changsen Yuan","orcid":"https://orcid.org/0000-0001-5802-7503"},"institutions":[{"id":"https://openalex.org/I37796252","display_name":"Beijing University of Technology","ror":"https://ror.org/037b1pp87","country_code":"CN","type":"education","lineage":["https://openalex.org/I37796252"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Changsen Yuan","raw_affiliation_strings":["Beijing University of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing University of Technology, Beijing, China","institution_ids":["https://openalex.org/I37796252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068136447","display_name":"Yanghao Zhou","orcid":"https://orcid.org/0009-0001-9401-4432"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang-Hao Zhou","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063010018","display_name":"Cunhan Guo","orcid":"https://orcid.org/0000-0002-4546-3928"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cunhan Guo","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014227817","display_name":"Danjie Han","orcid":null},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Danjie Han","raw_affiliation_strings":["Nanjing University of Science and Technology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101971395","display_name":"Ge Shi","orcid":"https://orcid.org/0000-0002-9296-9905"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ge Shi","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100676721","display_name":"Wenwu Wang","orcid":"https://orcid.org/0000-0002-8393-5703"},"institutions":[{"id":"https://openalex.org/I28290843","display_name":"University of Surrey","ror":"https://ror.org/00ks66431","country_code":"GB","type":"education","lineage":["https://openalex.org/I28290843"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenwu Wang","raw_affiliation_strings":["School of Computer Science and Electronic Engineering, University of Surrey, Guildford, U.K"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Electronic Engineering, University of Surrey, Guildford, U.K","institution_ids":["https://openalex.org/I28290843"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5076922285"],"corresponding_institution_ids":["https://openalex.org/I37796252"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41181573,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"2132","last_page":"2144"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8271999955177307,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8271999955177307,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.048500001430511475,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.02800000086426735,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/interpretability","display_name":"Interpretability","score":0.8184999823570251},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.621999979019165},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5205000042915344},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5077999830245972},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.41530001163482666},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.40560001134872437},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.367000013589859},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.3578000068664551},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.3513000011444092}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8736000061035156},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.8184999823570251},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6708999872207642},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.621999979019165},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5353999733924866},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5205000042915344},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5077999830245972},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.41530001163482666},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.40560001134872437},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.367000013589859},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3578000068664551},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3522999882698059},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.3513000011444092},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.34540000557899475},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.33809998631477356},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3370000123977661},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C59519942","wikidata":"https://www.wikidata.org/wiki/Q650665","display_name":"Drone","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.29660001397132874},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C25810664","wikidata":"https://www.wikidata.org/wiki/Q44325","display_name":"Ontology","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.27250000834465027},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3642913","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3642913","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Natural":[0],"language-guided":[1,86],"drone":[2],"geo-localization":[3,211],"(DGL)":[4],"provides":[5],"an":[6],"intuitive":[7],"and":[8,20,32,47,54,69,126,141,155,167,175,182,197,203],"scalable":[9],"mode":[10],"of":[11,164],"human-drone":[12],"interaction":[13],"for":[14,84,209],"tasks":[15,41],"such":[16],"as":[17,44,132],"search,":[18],"rescue,":[19],"surveillance.":[21],"Recent":[22],"Vision-Language":[23],"Models":[24],"(VLMs)":[25],"can":[26],"learn":[27],"semantic":[28,52,134],"correspondences":[29],"between":[30,153],"text":[31,125],"images":[33],"during":[34],"fine-tuning.":[35],"However,":[36],"their":[37],"performance":[38,196],"in":[39],"DGL":[40],"remains":[42],"constrained,":[43],"complex":[45,201],"instructions":[46],"cluttered":[48],"scenes":[49],"often":[50],"cause":[51],"dilution":[53],"granularity":[55],"mismatch,":[56],"leading":[57],"to":[58],"weak":[59],"cross-modal":[60,104,139,173],"alignment.":[61],"Consequently,":[62],"the":[63,108,143,188,216],"models":[64,98],"struggle":[65],"with":[66,95],"ambiguous":[67],"targets":[68],"suffer":[70],"from":[71,124],"reduced":[72],"localization":[73],"accuracy.":[74],"To":[75],"address":[76],"these":[77],"challenges,":[78],"we":[79],"propose":[80],"SAA-DGL,":[81],"a":[82],"framework":[83],"interpretable":[85],"Drone":[87],"Geo-Localization":[88],"that":[89,192],"enriches":[90],"Semantic":[91,111],"Attribute":[92,112],"Alignment":[93,146],"(SAA)":[94],"large":[96],"language":[97],"(LLMs).":[99],"It":[100],"introduces":[101],"two":[102],"parameter-free":[103],"fusion":[105,151],"modules:":[106],"(1)":[107],"LLM-driven":[109],"Cross-modal":[110],"Enrichment":[113],"(LCSAE)":[114],"module,":[115,148],"which":[116,149],"extracts":[117],"fine-grained":[118],"attributes":[119],"(e.g.,":[120],"color,":[121],"shape,":[122],"position)":[123],"embeds":[127],"them":[128],"into":[129],"visual":[130,154,166,202],"features":[131,157],"explicit":[133],"anchors,":[135],"producing":[136],"semantically":[137],"enriched":[138,165],"representations;":[140],"(2)":[142],"Bidirectional":[144],"Feature":[145],"(BFA)":[147],"builds":[150],"relationships":[152],"textual":[156,168],"via":[158],"similarity-driven":[159],"mechanisms,":[160],"enabling":[161],"effective":[162],"integration":[163],"information.":[169],"This":[170],"design":[171],"improves":[172],"consistency":[174],"interpretability":[176],"while":[177],"preserving":[178],"pretrained":[179],"alignment":[180],"priors":[181],"enhancing":[183],"training":[184],"stability.":[185],"Experiments":[186],"on":[187],"GeoText-1652":[189],"benchmark":[190],"show":[191],"SAA-DGL":[193],"achieves":[194],"state-of-the-art":[195],"strong":[198],"robustness":[199],"under":[200],"linguistic":[204],"disturbances,":[205],"validating":[206],"its":[207],"effectiveness":[208],"challenging":[210],"scenarios.":[212],"We":[213],"will":[214],"release":[215],"code.":[217]},"counts_by_year":[],"updated_date":"2026-03-26T06:05:38.182114","created_date":"2025-12-11T00:00:00"}
