{"id":"https://openalex.org/W7118849312","doi":"https://doi.org/10.48550/arxiv.2601.01870","title":"Entity-Guided Multi-Task Learning for Infrared and Visible Image Fusion","display_name":"Entity-Guided Multi-Task Learning for Infrared and Visible Image Fusion","publication_year":2026,"publication_date":"2026-01-05","ids":{"openalex":"https://openalex.org/W7118849312","doi":"https://doi.org/10.48550/arxiv.2601.01870"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.01870","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01870","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.01870","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070438805","display_name":"Wenyu Shao","orcid":"https://orcid.org/0000-0003-3934-1323"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shao, Wenyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122218818","display_name":"Hongbo Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Hongbo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109782907","display_name":"Y. Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Yunchuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5112121861","display_name":"Ruili Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Ruili","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5070438805"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11659","display_name":"Advanced Image Fusion Techniques","score":0.47929999232292175,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11659","display_name":"Advanced Image Fusion Techniques","score":0.47929999232292175,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08479999750852585,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.07039999961853027,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5081999897956848},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.4948999881744385},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4851999878883362},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4830999970436096},{"id":"https://openalex.org/keywords/image-fusion","display_name":"Image fusion","score":0.47279998660087585},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4537999927997589},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4226999878883362},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4147999882698059},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.3837999999523163}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7967000007629395},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6513000130653381},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5081999897956848},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.4948999881744385},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4851999878883362},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4830999970436096},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.47279998660087585},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4537999927997589},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4226999878883362},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4147999882698059},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.3837999999523163},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37560001015663147},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3691999912261963},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3662000000476837},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.35089999437332153},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.349700003862381},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.34929999709129333},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34450000524520874},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.3077999949455261},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.30410000681877136},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.28700000047683716},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27079999446868896},{"id":"https://openalex.org/C511149849","wikidata":"https://www.wikidata.org/wiki/Q7449051","display_name":"Semantic computing","level":3,"score":0.2702000141143799},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25929999351501465},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.2572999894618988},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.01870","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01870","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.01870","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01870","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"text-driven":[1],"infrared":[2,53],"and":[3,26,54,136,141,164,179,206,222,232],"visible":[4,55],"image":[5,56,78,106,134,190],"fusion":[6,46,57,107,191],"approaches":[7],"often":[8],"rely":[9],"on":[10],"textual":[11,36,75,166],"information":[12,76],"at":[13,176,238],"the":[14,31,118,126,139,145,159,184,188,195,227],"sentence":[15],"level,":[16],"which":[17,104,168],"can":[18],"lead":[19],"to":[20,28,72,128,157,226],"semantic":[21,33,86,94,123,142,223],"noise":[22,87],"from":[23,77,88],"redundant":[24],"text":[25,90],"fail":[27],"fully":[29],"exploit":[30],"deeper":[32,131],"value":[34],"of":[35,133,144,187,198],"information.":[37],"To":[38,182],"address":[39],"these":[40],"issues,":[41],"we":[42,193],"propose":[43],"a":[44,109,130],"novel":[45],"approach":[47,60],"named":[48],"Entity-Guided":[49],"Multi-Task":[50],"learning":[51,100],"for":[52],"(EGMT).":[58],"Our":[59],"includes":[61],"three":[62],"key":[63],"innovative":[64],"components:":[65],"(i)":[66],"A":[67,97],"principled":[68],"method":[69],"is":[70,102,154],"proposed":[71],"extract":[73],"entity-level":[74,165],"captions":[79],"generated":[80],"by":[81,172],"large":[82],"vision-language":[83],"models,":[84],"eliminating":[85],"raw":[89],"while":[91],"preserving":[92,217],"critical":[93],"information;":[95],"(ii)":[96],"parallel":[98],"multi-task":[99],"architecture":[101],"constructed,":[103],"integrates":[105],"with":[108],"multi-label":[110,119],"classification":[111,120],"task.":[112],"By":[113],"using":[114],"entities":[115],"as":[116],"pseudo-labels,":[117],"task":[121],"provides":[122],"supervision,":[124],"enabling":[125],"model":[127],"achieve":[129],"understanding":[132],"content":[135],"significantly":[137],"improving":[138],"quality":[140],"density":[143],"fused":[146],"image;":[147],"(iii)":[148],"An":[149],"entity-guided":[150,189],"cross-modal":[151,174],"interactive":[152],"module":[153],"also":[155],"developed":[156],"facilitate":[158],"fine-grained":[160],"interaction":[161],"between":[162],"visual":[163],"features,":[167],"enhances":[169],"feature":[170],"representation":[171],"capturing":[173],"dependencies":[175],"both":[177],"inter-visual":[178],"visual-entity":[180],"levels.":[181],"promote":[183],"wide":[185],"application":[186],"framework,":[192],"release":[194],"entity-annotated":[196],"version":[197],"four":[199],"public":[200],"datasets":[201],"(i.e.,":[202],"TNO,":[203],"RoadScene,":[204],"M3FD,":[205],"MSRS).":[207],"Extensive":[208],"experiments":[209],"demonstrate":[210],"that":[211],"EGMT":[212],"achieves":[213],"superior":[214],"performance":[215],"in":[216],"salient":[218],"targets,":[219],"texture":[220],"details,":[221],"consistency,":[224],"compared":[225],"state-of-the-art":[228],"methods.":[229],"The":[230],"code":[231],"dataset":[233],"will":[234],"be":[235],"publicly":[236],"available":[237],"https://github.com/wyshao-01/EGMT.":[239]},"counts_by_year":[],"updated_date":"2026-01-08T20:10:11.968330","created_date":"2026-01-08T00:00:00"}
