{"id":"https://openalex.org/W7164847835","doi":"https://doi.org/10.1145/3805622.3810755","title":"LaViSE: Language-aware Vision Scale Enhancement for Referring Remote Sensing Image Segmentation","display_name":"LaViSE: Language-aware Vision Scale Enhancement for Referring Remote Sensing Image Segmentation","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164847835","doi":"https://doi.org/10.1145/3805622.3810755"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810755","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810755","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810755","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085035158","display_name":"Yan Li","orcid":"https://orcid.org/0000-0001-5564-5074"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Li","raw_affiliation_strings":["Zhejiang University of Technology, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-5564-5074","affiliations":[{"raw_affiliation_string":"Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138658959","display_name":"Junjie Zheng","orcid":"https://orcid.org/0009-0003-8003-9107"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junjie Zheng","raw_affiliation_strings":["Zhejiang University of Technology, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0003-8003-9107","affiliations":[{"raw_affiliation_string":"Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138667259","display_name":"Zhouchao Fu","orcid":"https://orcid.org/0009-0009-0082-541X"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhouchao Fu","raw_affiliation_strings":["Zhejiang University of Technology, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0009-0082-541X","affiliations":[{"raw_affiliation_string":"Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070507519","display_name":"Shengjie Yang","orcid":"https://orcid.org/0009-0006-7174-5220"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengjie Yang","raw_affiliation_strings":["Zhejiang University of Technology, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0001-5759-6250","affiliations":[{"raw_affiliation_string":"Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138642695","display_name":"Junjie Liao","orcid":"https://orcid.org/0009-0006-8386-6601"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junjie Liao","raw_affiliation_strings":["Zhejiang University of Technology, Hangzhou, China"],"raw_orcid":"https://orcid.org/0009-0006-8386-6601","affiliations":[{"raw_affiliation_string":"Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5026233608","display_name":"Jianwei Zheng","orcid":"https://orcid.org/0000-0001-6017-0552"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwei Zheng","raw_affiliation_strings":["Zhejiang University of Technology, Hangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-6017-0552","affiliations":[{"raw_affiliation_string":"Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93869522,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"738","last_page":"747"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.5038999915122986,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.5038999915122986,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4138000011444092,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.015399999916553497,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.670199990272522},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5601999759674072},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5008000135421753},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4918999969959259},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4887999892234802},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.4690999984741211},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.46129998564720154},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4424000084400177},{"id":"https://openalex.org/keywords/boundary","display_name":"Boundary (topology)","score":0.39750000834465027}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7508000135421753},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.670199990272522},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6603000164031982},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5845999717712402},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5601999759674072},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5008000135421753},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4918999969959259},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4887999892234802},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.4690999984741211},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.46129998564720154},{"id":"https://openalex.org/C62649853","wikidata":"https://www.wikidata.org/wiki/Q199687","display_name":"Remote sensing","level":1,"score":0.4474000036716461},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4424000084400177},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.39750000834465027},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.39309999346733093},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.37689998745918274},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3515999913215637},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.34700000286102295},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.34700000286102295},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3075999915599823},{"id":"https://openalex.org/C125308379","wikidata":"https://www.wikidata.org/wiki/Q363057","display_name":"Market segmentation","level":2,"score":0.2888999879360199},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2881999909877777},{"id":"https://openalex.org/C183365957","wikidata":"https://www.wikidata.org/wiki/Q17140402","display_name":"Remote sensing application","level":3,"score":0.2782000005245209},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2619999945163727}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810755","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810755","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810755","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810755","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W2108598243","https://openalex.org/W2194775991","https://openalex.org/W2255466643","https://openalex.org/W2302548814","https://openalex.org/W2605127024","https://openalex.org/W2798556392","https://openalex.org/W2964284374","https://openalex.org/W2964309882","https://openalex.org/W2964345792","https://openalex.org/W2979826702","https://openalex.org/W2980088508","https://openalex.org/W3034325957","https://openalex.org/W3034692043","https://openalex.org/W3096609285","https://openalex.org/W3108748824","https://openalex.org/W3138516171","https://openalex.org/W3216551675","https://openalex.org/W4200631575","https://openalex.org/W4307504011","https://openalex.org/W4386071687","https://openalex.org/W4390873528","https://openalex.org/W4390874575","https://openalex.org/W4392397297","https://openalex.org/W4402727260","https://openalex.org/W4403780754","https://openalex.org/W4405787134","https://openalex.org/W4410842412","https://openalex.org/W4411019933","https://openalex.org/W4411171980","https://openalex.org/W4416728728"],"related_works":[],"abstract_inverted_index":{"Referring":[0],"Remote":[1],"Sensing":[2],"Image":[3],"Segmentation":[4],"(RRSIS)":[5],"aims":[6],"to":[7],"segment":[8],"target":[9],"objects":[10],"in":[11,139,161],"aerial":[12],"imagery":[13],"based":[14],"on":[15,145],"natural":[16],"language":[17,47],"expressions.":[18],"Although":[19],"recent":[20],"multi-scale":[21],"feature":[22,125],"aggregation":[23],"methods":[24],"have":[25],"improved":[26],"cross-modal":[27,76],"alignment,":[28],"and":[29,37,46,92,113,116,123,136,150,166,172],"existing":[30],"approaches":[31],"still":[32],"struggle":[33],"with":[34,96,164],"accurate":[35],"localization":[36,84],"segmentation":[38],"across":[39,127],"scales":[40,45,80],"because":[41],"interactions":[42],"between":[43],"visual":[44,90,94],"are":[48],"not":[49],"sufficiently":[50],"modeled.":[51],"To":[52],"address":[53],"these":[54],"challenges,":[55],"we":[56],"propose":[57],"a":[58],"SAM-based":[59],"framework":[60],"termed":[61],"LaViSE,":[62],"which":[63,129],"incorporates":[64],"two":[65],"key":[66],"modules":[67],":":[68],"the":[69,99,146],"Language-Guided":[70],"Hierarchical":[71],"Fusion":[72],"(LGHF)":[73],"module":[74,104],"integrates":[75],"features":[77,95,108],"at":[78],"multiple":[79],"for":[81,132],"precise":[82],"object":[83],"by":[85],"injecting":[86],"spatial":[87,111],"coordinates":[88],"into":[89],"representations":[91],"combining":[93],"aligned":[97],"features;":[98],"Language-Attentive":[100],"Scale-Unified":[101],"Aggregation":[102],"(LASA)":[103],"globally":[105],"merges":[106],"multi-level":[107],"while":[109],"maintaining":[110],"consistency":[112],"boundary":[114],"fidelity,":[115],"further":[117],"preserves":[118],"fine-grained":[119],"structural":[120],"details":[121],"through":[122],"language-conditioned":[124],"recalibration":[126],"scales,":[128],"is":[130],"crucial":[131],"segmenting":[133],"densely":[134,167],"distributed":[135,168],"scale-varying":[137],"targets":[138],"complex":[140],"remote":[141],"sensing":[142],"imagery.":[143],"Experiments":[144],"widely":[147],"used":[148],"RefSegRS":[149],"RRSIS-D":[151],"benchmarks":[152],"demonstrate":[153],"that":[154],"LaViSE":[155],"consistently":[156],"outperforms":[157],"state-of-the-art":[158],"methods,":[159],"particularly":[160],"challenging":[162],"scenarios":[163],"small":[165],"objects.":[169],"Our":[170],"code":[171],"pre-trained":[173],"models":[174],"will":[175],"be":[176],"released":[177],"upon":[178],"publication.":[179]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
