{"id":"https://openalex.org/W7140165990","doi":"https://doi.org/10.48550/arxiv.2603.21999","title":"STENet: Superpixel Token Enhancing Network for RGB-D Salient Object Detection","display_name":"STENet: Superpixel Token Enhancing Network for RGB-D Salient Object Detection","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140165990","doi":"https://doi.org/10.48550/arxiv.2603.21999"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21999","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21999","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21999","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Chen, Jianlin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Jianlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Gongyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Gongyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Zhijiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhijiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chang, Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Liang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Zeng, Dan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.000699999975040555,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11165","display_name":"Image and Video Quality Assessment","score":0.0003000000142492354,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.7318999767303467},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.6743999719619751},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5748999714851379},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5425000190734863},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5368000268936157},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.45190000534057617},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.4449999928474426},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4375},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.4352000057697296}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7391999959945679},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.7318999767303467},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7261999845504761},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.6743999719619751},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5748999714851379},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5425000190734863},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5368000268936157},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.51910001039505},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.45190000534057617},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.4449999928474426},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4375},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.4352000057697296},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.43369999527931213},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.37880000472068787},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.34880000352859497},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3416999876499176},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3303999900817871},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3303000032901764},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.31130000948905945},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3089999854564667},{"id":"https://openalex.org/C31510193","wikidata":"https://www.wikidata.org/wiki/Q1192553","display_name":"Facial recognition system","level":3,"score":0.304500013589859},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C4641261","wikidata":"https://www.wikidata.org/wiki/Q11681085","display_name":"Face detection","level":4,"score":0.2816999852657318},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.265500009059906},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.26109999418258667}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21999","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21999","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21999","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21999","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Transformer-based":[0],"methods":[1,27],"for":[2,80,102],"RGB-D":[3,25,207],"Salient":[4],"Object":[5],"Detection":[6],"(SOD)":[7],"have":[8],"gained":[9],"significant":[10],"interest,":[11],"owing":[12],"to":[13,18,124,161,199,218],"the":[14,32,36,40,66,89,95,110,118,126,132,149,188,196],"transformer's":[15],"exceptional":[16],"capacity":[17],"capture":[19,139],"long-range":[20],"pixel":[21,157],"dependencies.":[22],"Nevertheless,":[23],"current":[24],"SOD":[26,208],"face":[28],"challenges,":[29],"such":[30],"as":[31],"quadratic":[33],"complexity":[34],"of":[35,98,166,225],"attention":[37],"mechanism":[38],"and":[39,82,107,142,171,190,223],"limited":[41],"local":[42,83,169,178,183],"detail":[43],"extraction.":[44],"To":[45],"overcome":[46],"these":[47,177],"limitations,":[48],"we":[49,87,115,186],"propose":[50,117,148],"a":[51,164],"novel":[52],"Superpixel":[53,119,150],"Token":[54],"Enhancing":[55,122],"Network":[56],"(STENet),":[57],"which":[58,137,155],"introduces":[59],"superpixels":[60,160],"into":[61],"cross-modal":[62,76],"interaction.":[63],"STENet":[64,213],"follows":[65],"two-stream":[67],"encoder-decoder":[68],"structure.":[69],"Its":[70],"cores":[71],"are":[72,228],"two":[73],"tailored":[74],"superpixel-driven":[75],"interaction":[77],"modules,":[78],"responsible":[79],"global":[81,127,134],"feature":[84,174,202],"enhancement.":[85],"Specifically,":[86],"update":[88],"superpixel":[90,112],"generation":[91,113],"method":[92,227],"by":[93],"expanding":[94],"neighborhood":[96],"range":[97],"each":[99],"superpixel,":[100],"allowing":[101],"flexible":[103],"transformation":[104],"between":[105],"pixels":[106,167],"superpixels.":[108],"With":[109],"updated":[111],"method,":[114],"first":[116],"Attention":[120,151],"Global":[121],"Module":[123],"model":[125],"pixel-to-superpixel":[128],"relationship":[129],"rather":[130],"than":[131],"traditional":[133],"pixel-to-pixel":[135],"relationship,":[136],"can":[138],"region-level":[140],"information":[141],"reduce":[143],"computational":[144],"complexity.":[145],"We":[146],"also":[147],"Local":[152],"Refining":[153],"Module,":[154],"leverages":[156],"similarity":[158],"within":[159],"filter":[162],"out":[163],"subset":[165],"(i.e.,":[168],"pixels)":[170],"then":[172],"performs":[173],"enhancement":[175],"on":[176,205],"pixels,":[179],"thereby":[180],"capturing":[181],"concerned":[182],"details.":[184],"Furthermore,":[185],"fuse":[187],"globally":[189],"locally":[191],"enhanced":[192],"features":[193,198],"along":[194],"with":[195],"cross-scale":[197],"achieve":[200],"comprehensive":[201],"representation.":[203],"Experiments":[204],"seven":[206],"datasets":[209],"reveal":[210],"that":[211],"our":[212,226],"achieves":[214],"competitive":[215],"performance":[216],"compared":[217],"state-of-the-art":[219],"methods.":[220],"The":[221],"code":[222],"results":[224],"available":[229],"at":[230],"https://github.com/Mark9010/STENet.":[231]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
