{"id":"https://openalex.org/W7125909372","doi":"https://doi.org/10.1109/smc58881.2025.11343398","title":"Difference-Guided Modality Fusion Network for Multimodal Object Detection","display_name":"Difference-Guided Modality Fusion Network for Multimodal Object Detection","publication_year":2025,"publication_date":"2025-10-05","ids":{"openalex":"https://openalex.org/W7125909372","doi":"https://doi.org/10.1109/smc58881.2025.11343398"},"language":null,"primary_location":{"id":"doi:10.1109/smc58881.2025.11343398","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343398","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124136099","display_name":"Linxuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Linxuan Li","raw_affiliation_strings":["Xi&#x2019;an Jiaotong University,National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics,Xi&#x2019;an,China,710049"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong University,National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics,Xi&#x2019;an,China,710049","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121920846","display_name":"Meiqin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meiqin Liu","raw_affiliation_strings":["Xi&#x2019;an Jiaotong University,National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics,Xi&#x2019;an,China,710049"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong University,National Key Laboratory of Human-Machine Hybrid Augmented Intelligence, National Engineering Research Center for Visual Information and Applications, and Institute of Artificial Intelligence and Robotics,Xi&#x2019;an,China,710049","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124129234","display_name":"Jian Lan","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Lan","raw_affiliation_strings":["Xi&#x2019;an Jiaotong University,School of Electronics and Information Engineering,Xi&#x2019;an,China,710049"],"affiliations":[{"raw_affiliation_string":"Xi&#x2019;an Jiaotong University,School of Electronics and Information Engineering,Xi&#x2019;an,China,710049","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124066312","display_name":"Shanling Dong","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shanling Dong","raw_affiliation_strings":["Zhejiang University,College of Electrical Engineering,Hangzhou,China,310027"],"affiliations":[{"raw_affiliation_string":"Zhejiang University,College of Electrical Engineering,Hangzhou,China,310027","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124090857","display_name":"Zhunga Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhunga Liu","raw_affiliation_strings":["Northwestern Polytechnical University,School of Automation,Xi&#x2019;an,China,710072"],"affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University,School of Automation,Xi&#x2019;an,China,710072","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5124136099"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.71618344,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"716","last_page":"721"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6421999931335449,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6421999931335449,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12389","display_name":"Infrared Target Detection Methodologies","score":0.11599999666213989,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.05040000006556511,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.6477000117301941},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.631600022315979},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.5893999934196472},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.546500027179718},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5430999994277954},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5360000133514404},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.4952999949455261},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.490200012922287},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.4821000099182129}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7566999793052673},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7506999969482422},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.6477000117301941},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.631600022315979},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.5893999934196472},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.546500027179718},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5430999994277954},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5360000133514404},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.4952999949455261},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.490200012922287},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.4821000099182129},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.46050000190734863},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.45509999990463257},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4512999951839447},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43209999799728394},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.41440001130104065},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3756999969482422},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.33649998903274536},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.31679999828338623},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.3156000077724457},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.302700012922287},{"id":"https://openalex.org/C2780735816","wikidata":"https://www.wikidata.org/wiki/Q28324931","display_name":"Incremental learning","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2962999939918518},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.287200003862381},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.266400009393692},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/smc58881.2025.11343398","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343398","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2056716515","https://openalex.org/W2955058313","https://openalex.org/W3034421924","https://openalex.org/W3036931590","https://openalex.org/W3207919963","https://openalex.org/W3213472242","https://openalex.org/W4312594135","https://openalex.org/W4312750622","https://openalex.org/W4313007055","https://openalex.org/W4313267411","https://openalex.org/W4327785494","https://openalex.org/W4385801610","https://openalex.org/W4386076504","https://openalex.org/W4386189887","https://openalex.org/W4387968049","https://openalex.org/W4391849436","https://openalex.org/W4394827148","https://openalex.org/W4395447471","https://openalex.org/W4398765815","https://openalex.org/W4401567633","https://openalex.org/W4402915974","https://openalex.org/W4402916146","https://openalex.org/W4403299787"],"related_works":[],"abstract_inverted_index":{"In":[0],"recent":[1],"years,":[2],"visible-infrared":[3],"object":[4],"detection":[5,46],"has":[6],"achieved":[7],"significant":[8],"progress.":[9],"However,":[10],"most":[11],"existing":[12],"methods":[13],"primarily":[14],"emphasize":[15],"the":[16,20,34,43,60,67,99,103,124,130,133,140,153,156],"shared":[17],"features":[18,101,138],"between":[19,78,139],"two":[21,104],"modalities":[22],"while":[23],"overlooking":[24],"their":[25],"feature":[26,76],"differences.":[27],"To":[28,71],"address":[29],"this":[30],"limitation,":[31],"we":[32,51,80,107],"propose":[33,52],"Difference-Guided":[35],"Modality":[36],"Fusion":[37],"Network,":[38],"which":[39],"can":[40,120],"effectively":[41],"improve":[42],"fusion":[44,85],"and":[45,74,97,126,147,164],"performance":[47],"of":[48,62,102,129,155],"modalities.":[49,105,141],"Specifically,":[50],"a":[53,82,89,109,116],"cross-modal":[54],"data":[55],"augmentation":[56],"strategy":[57],"to":[58,135],"overcome":[59],"limitations":[61],"single-modality":[63],"reliance":[64],"by":[65],"exchanging":[66],"partial":[68],"modal":[69,93],"information.":[70],"further":[72],"capture":[73],"analyze":[75],"differences":[77,125],"modalities,":[79,131],"introduce":[81],"differential":[83],"attention":[84],"approach":[86],"that":[87,114,119],"models":[88],"difference":[90],"matrix":[91],"across":[92],"channels,":[94],"thereby":[95],"quantifying":[96],"strengthening":[98],"salient":[100],"Additionally,":[106],"develop":[108],"modality-aware":[110],"dynamic":[111],"learning":[112],"mechanism":[113],"employs":[115],"loss":[117],"function":[118],"simultaneously":[121],"focus":[122],"on":[123,144],"common":[127],"parts":[128],"guiding":[132],"model":[134],"adaptively":[136],"learn":[137],"Experimental":[142],"results":[143],"FLIR,":[145],"LLVIP":[146],"M<sup":[148],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[149],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">3</sup>FD":[150],"datasets":[151],"demonstrate":[152],"effectiveness":[154],"proposed":[157],"method,":[158],"with":[159],"mAP":[160],"reaching":[161],"42.3%,":[162],"67.5%":[163],"59.0%":[165],"respectively.":[166]},"counts_by_year":[],"updated_date":"2026-01-29T23:17:01.242718","created_date":"2026-01-29T00:00:00"}
