{"id":"https://openalex.org/W7164814741","doi":"https://doi.org/10.1145/3805622.3810666","title":"ICMF-Net: Interactive Cross-Modal Fusion with Attention and Selection Network for Remote Sensing Object Detection","display_name":"ICMF-Net: Interactive Cross-Modal Fusion with Attention and Selection Network for Remote Sensing Object Detection","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164814741","doi":"https://doi.org/10.1145/3805622.3810666"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810666","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810666","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810666","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5138636439","display_name":"Kun Yao","orcid":"https://orcid.org/0009-0009-1634-7539"},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kun Yao","raw_affiliation_strings":["School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China"],"raw_orcid":"https://orcid.org/0009-0009-1634-7539","affiliations":[{"raw_affiliation_string":"School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China","institution_ids":["https://openalex.org/I2722730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049225588","display_name":"Hao Zeng","orcid":"https://orcid.org/0000-0002-1831-3019"},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Zeng","raw_affiliation_strings":["School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China"],"raw_orcid":"https://orcid.org/0009-0000-3232-3742","affiliations":[{"raw_affiliation_string":"School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China","institution_ids":["https://openalex.org/I2722730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108050989","display_name":"Yuxing Wang","orcid":"https://orcid.org/0000-0001-9156-0377"},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yida Wang","raw_affiliation_strings":["School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China"],"raw_orcid":"https://orcid.org/0009-0005-8785-6470","affiliations":[{"raw_affiliation_string":"School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China","institution_ids":["https://openalex.org/I2722730"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5032990372","display_name":"Ming Ma","orcid":"https://orcid.org/0000-0002-2060-2266"},"institutions":[{"id":"https://openalex.org/I2722730","display_name":"Inner Mongolia University","ror":"https://ror.org/0106qb496","country_code":"CN","type":"education","lineage":["https://openalex.org/I2722730"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Ma","raw_affiliation_strings":["School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China"],"raw_orcid":"https://orcid.org/0000-0002-2060-2266","affiliations":[{"raw_affiliation_string":"School of Computer Science, Inner Mongolia University, Hohhot, Inner Mongolia, China","institution_ids":["https://openalex.org/I2722730"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93462841,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1442","last_page":"1449"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7027000188827515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7027000188827515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10689","display_name":"Remote-Sensing Image Classification","score":0.1647000014781952,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11659","display_name":"Advanced Image Fusion Techniques","score":0.044599998742341995,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.5156000256538391},{"id":"https://openalex.org/keywords/multispectral-image","display_name":"Multispectral image","score":0.4912000000476837},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.4844000041484833},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.4555000066757202},{"id":"https://openalex.org/keywords/rgb-color-model","display_name":"RGB color model","score":0.4081999957561493},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.40529999136924744},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.39259999990463257},{"id":"https://openalex.org/keywords/remote-sensing-application","display_name":"Remote sensing application","score":0.3840999901294708},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.3840999901294708},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.3831999897956848}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7267000079154968},{"id":"https://openalex.org/C62649853","wikidata":"https://www.wikidata.org/wiki/Q199687","display_name":"Remote sensing","level":1,"score":0.6539999842643738},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5478000044822693},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.5156000256538391},{"id":"https://openalex.org/C173163844","wikidata":"https://www.wikidata.org/wiki/Q1761440","display_name":"Multispectral image","level":2,"score":0.4912000000476837},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.4844000041484833},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.4555000066757202},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43810001015663147},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.4081999957561493},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.40529999136924744},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.39259999990463257},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.3840999901294708},{"id":"https://openalex.org/C183365957","wikidata":"https://www.wikidata.org/wiki/Q17140402","display_name":"Remote sensing application","level":3,"score":0.3840999901294708},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.3831999897956848},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C159078339","wikidata":"https://www.wikidata.org/wiki/Q959005","display_name":"Hyperspectral imaging","level":2,"score":0.329800009727478},{"id":"https://openalex.org/C202269582","wikidata":"https://www.wikidata.org/wiki/Q2644277","display_name":"Complementarity (molecular biology)","level":2,"score":0.3248000144958496},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.32260000705718994},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3165000081062317},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.31060001254081726},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.30799999833106995},{"id":"https://openalex.org/C110384440","wikidata":"https://www.wikidata.org/wiki/Q1143270","display_name":"Upsampling","level":3,"score":0.3043999969959259},{"id":"https://openalex.org/C39399123","wikidata":"https://www.wikidata.org/wiki/Q1348989","display_name":"Earth observation","level":3,"score":0.30000001192092896},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2992999851703644},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C21200559","wikidata":"https://www.wikidata.org/wiki/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C104541649","wikidata":"https://www.wikidata.org/wiki/Q6935090","display_name":"Multispectral pattern recognition","level":3,"score":0.2768999934196472},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C3019973339","wikidata":"https://www.wikidata.org/wiki/Q899523","display_name":"Object based","level":3,"score":0.2655999958515167},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26510000228881836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810666","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810666","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810666","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810666","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.8458445072174072,"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities"}],"awards":[{"id":"https://openalex.org/G5391925069","display_name":null,"funder_award_id":"62462049","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W2193145675","https://openalex.org/W2606788270","https://openalex.org/W2752782242","https://openalex.org/W2884585870","https://openalex.org/W2963037989","https://openalex.org/W3034552520","https://openalex.org/W3108601100","https://openalex.org/W4200142374","https://openalex.org/W4318833867","https://openalex.org/W4321021845","https://openalex.org/W4327785494","https://openalex.org/W4386071462","https://openalex.org/W4386189887","https://openalex.org/W4387618979","https://openalex.org/W4387828818","https://openalex.org/W4393171245","https://openalex.org/W4402569181","https://openalex.org/W4415537266"],"related_works":[],"abstract_inverted_index":{"Remote":[0],"sensing":[1,79,184],"object":[2,80,185],"detection":[3,186],"plays":[4],"a":[5,178],"critical":[6],"role":[7],"in":[8,134],"applications":[9],"such":[10],"as":[11,177],"ecological":[12],"monitoring,":[13],"mining":[14],"management,":[15],"and":[16,31,71,91,104,111,122,125,137,143,148,158,169,180],"urban":[17],"planning,":[18],"yet":[19],"remains":[20],"challenging":[21],"due":[22],"to":[23,66,139,146],"significant":[24],"scale":[25],"variations,":[26],"densely":[27,149],"distributed":[28,150],"small":[29,147],"objects,":[30],"complex":[32],"background":[33],"interference.":[34],"Previous":[35],"works":[36],"primarily":[37],"rely":[38],"on":[39,154],"RGB-only":[40],"or":[41],"multispectral":[42],"imagery,":[43],"often":[44],"neglecting":[45],"the":[46,63,135,155,172],"DSM":[47],"data":[48,76],"that":[49,162],"encodes":[50],"valuable":[51],"height":[52],"information.":[53],"To":[54],"address":[55],"these":[56],"limitations,":[57],"we":[58],"propose":[59],"ICMF-Net,":[60],"one":[61],"of":[62,84],"first":[64],"studies":[65],"leverage":[67],"both":[68],"RGB":[69],"imagery":[70],"Digital":[72],"Surface":[73],"Model":[74],"(DSM)":[75],"for":[77],"remote":[78,183],"detection.":[81],"ICMF-Net":[82,163],"consists":[83],"three":[85],"key":[86],"components:":[87],"CMASF":[88],"(Cross-Modal":[89],"Attention":[90],"Selection":[92],"Fusion),":[93],"which":[94,114,129],"enhances":[95],"modality":[96],"complementarity":[97],"via":[98],"adaptive":[99],"cross-modal":[100],"attention,":[101],"feature":[102,117],"selection,":[103],"height-aware":[105],"spatial":[106],"enhancement;":[107],"FEMR":[108],"(Feature":[109],"Extraction":[110],"Multi-scale":[112],"Refinement),":[113],"strengthens":[115],"hierarchical":[116],"representation":[118],"through":[119],"multi-scale":[120],"attention":[121],"redundancy-aware":[123],"convolution;":[124],"SPD-Conv":[126],"(Space-to-Depth":[127],"Convolution),":[128],"replaces":[130],"standard":[131],"strided":[132],"convolutions":[133],"backbone":[136],"neck":[138],"enable":[140],"information-preserving":[141],"downsampling":[142],"improve":[144],"sensitivity":[145],"targets.":[151],"Extensive":[152],"experiments":[153],"ISPRS":[156],"Potsdam":[157],"MMOD":[159],"benchmarks":[160],"demonstrate":[161],"improves":[164],"mAP50:":[165],"95":[166],"by":[167],"5.2%":[168],"4.3%":[170],"over":[171],"baseline,":[173],"respectively,":[174],"establishing":[175],"it":[176],"robust":[179],"effective":[181],"RGB\u2013DSM":[182],"framework.":[187]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
