{"id":"https://openalex.org/W7125899521","doi":"https://doi.org/10.1109/smc58881.2025.11343075","title":"HTF: Hierarchical Transformer Fusion for Fine-Grained Sketch-based Retrieval","display_name":"HTF: Hierarchical Transformer Fusion for Fine-Grained Sketch-based Retrieval","publication_year":2025,"publication_date":"2025-10-05","ids":{"openalex":"https://openalex.org/W7125899521","doi":"https://doi.org/10.1109/smc58881.2025.11343075"},"language":null,"primary_location":{"id":"doi:10.1109/smc58881.2025.11343075","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343075","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124086487","display_name":"Ning Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I135237710","display_name":"Zhejiang Normal University","ror":"https://ror.org/01vevwk45","country_code":"CN","type":"education","lineage":["https://openalex.org/I135237710"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ning Zhang","raw_affiliation_strings":["Zhejiang Normal University,School of Computer Science and Technology,Jinhua,China,321004"],"affiliations":[{"raw_affiliation_string":"Zhejiang Normal University,School of Computer Science and Technology,Jinhua,China,321004","institution_ids":["https://openalex.org/I135237710"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016503919","display_name":"Mudan Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I109935558","display_name":"Ningbo University","ror":"https://ror.org/03et85d35","country_code":"CN","type":"education","lineage":["https://openalex.org/I109935558"]},{"id":"https://openalex.org/I159389169","display_name":"Ningbo University of Technology","ror":"https://ror.org/037dym702","country_code":"CN","type":"education","lineage":["https://openalex.org/I159389169"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mudan Yu","raw_affiliation_strings":["Ningbo University,School of Information Engineering, College of Science and Technology,Ningbo,China,315300"],"affiliations":[{"raw_affiliation_string":"Ningbo University,School of Information Engineering, College of Science and Technology,Ningbo,China,315300","institution_ids":["https://openalex.org/I159389169","https://openalex.org/I109935558"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124106789","display_name":"Chao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Wang","raw_affiliation_strings":["Wuhan University,National Engineering Research Center for Multimedia Software, School of Computer Science,Wuhan,China,430072"],"affiliations":[{"raw_affiliation_string":"Wuhan University,National Engineering Research Center for Multimedia Software, School of Computer Science,Wuhan,China,430072","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122328122","display_name":"Mithun Mukherjee","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164135","display_name":"Birla Institute of Technology and Science, Pilani - Dubai Campus","ror":"https://ror.org/05jnbme07","country_code":"AE","type":"education","lineage":["https://openalex.org/I4210164135","https://openalex.org/I74796645"]}],"countries":["AE"],"is_corresponding":false,"raw_author_name":"Mithun Mukherjee","raw_affiliation_strings":["Birla Institute of Technology and Science,Department of Computer Science,Dubai,UAE"],"affiliations":[{"raw_affiliation_string":"Birla Institute of Technology and Science,Department of Computer Science,Dubai,UAE","institution_ids":["https://openalex.org/I4210164135"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5124086487"],"corresponding_institution_ids":["https://openalex.org/I135237710"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.71547658,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2691","last_page":"2696"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.5482000112533569,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.5482000112533569,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.2535000145435333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.11599999666213989,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sketch","display_name":"Sketch","score":0.6758000254631042},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6312000155448914},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.5221999883651733},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5048999786376953},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.48170000314712524},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.48089998960494995},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.3961000144481659}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7882000207901001},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.6758000254631042},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6312000155448914},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6098999977111816},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.5221999883651733},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5048999786376953},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.48170000314712524},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.48089998960494995},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3961000144481659},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.38269999623298645},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3788999915122986},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3343999981880188},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3142000138759613},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.29490000009536743},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.29409998655319214},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.2639999985694885},{"id":"https://openalex.org/C101814296","wikidata":"https://www.wikidata.org/wiki/Q5439685","display_name":"Feature model","level":3,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/smc58881.2025.11343075","is_oa":false,"landing_page_url":"https://doi.org/10.1109/smc58881.2025.11343075","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Systems, Man, and Cybernetics (SMC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W2032001302","https://openalex.org/W2054508433","https://openalex.org/W2466618734","https://openalex.org/W2467281799","https://openalex.org/W2493181180","https://openalex.org/W2511925527","https://openalex.org/W2752782242","https://openalex.org/W2776402438","https://openalex.org/W2988772405","https://openalex.org/W3034603197","https://openalex.org/W3035035925","https://openalex.org/W3127164946","https://openalex.org/W3138516171","https://openalex.org/W4241811150"],"related_works":[],"abstract_inverted_index":{"Fine-grained":[0],"sketch-based":[1],"image":[2],"retrieval":[3,63,167,190],"(FGSBIR)":[4],"aims":[5],"to":[6,57,141,175,193],"identify":[7],"target":[8],"photographs":[9],"through":[10,37,153,169],"minimal":[11],"sketch":[12,48,67,178],"input,":[13],"yet":[14],"current":[15],"approaches":[16],"face":[17],"critical":[18,76],"limitations":[19],"in":[20],"balancing":[21],"local":[22,110],"feature":[23,90,112,126],"precision":[24],"with":[25,92,114],"global":[26,120],"contextual":[27],"awareness.":[28],"Although":[29],"existing":[30,194],"methods":[31],"predominantly":[32],"focus":[33],"on":[34],"stroke-level":[35,111],"variations":[36],"convolutional":[38],"architectures,":[39],"they":[40],"often":[41],"neglect":[42],"the":[43,61,66,173],"crucial":[44],"interplay":[45],"between":[46],"detailed":[47],"patterns":[49,179],"and":[50,71,116,180,197],"holistic":[51],"structural":[52],"semantics.":[53],"This":[54],"oversight":[55],"leads":[56],"suboptimal":[58],"performance":[59],"during":[60],"early":[62,189],"phases":[64],"when":[65],"input":[68],"remains":[69],"incomplete":[70],"abstract.":[72],"To":[73],"bridge":[74],"this":[75],"gap,":[77],"we":[78,157],"present":[79],"a":[80,103,159],"novel":[81],"Hierarchical":[82],"Transformer":[83,108,117],"Fusion":[84],"(HTF)":[85],"framework":[86],"that":[87,128,165],"synergizes":[88],"dual-branch":[89,104],"extraction":[91,113],"reinforcement":[93,160],"learning":[94],"optimization.":[95],"Our":[96],"architecture":[97],"introduces":[98],"three":[99],"key":[100],"innovations:":[101],"First,":[102],"encoder":[105,118],"combines":[106],"Swin":[107],"for":[109,119],"ResNet-50":[115],"semantic":[121],"understanding,":[122],"providing":[123],"rich":[124],"multi-level":[125],"representations":[127],"facilitate":[129],"effective":[130],"cross-modal":[131,162],"matching.":[132],"Second,":[133],"an":[134],"adaptive":[135],"fusion":[136],"mechanism":[137],"employs":[138],"learnable":[139],"weights":[140],"dynamically":[142],"integrate":[143],"multi-scale":[144],"features":[145],"from":[146],"both":[147],"branches,":[148],"prioritizing":[149],"semantically":[150],"salient":[151],"regions":[152],"content-aware":[154],"attention.":[155],"Third,":[156],"devise":[158],"learning-driven":[161],"alignment":[163],"strategy":[164],"optimizes":[166],"policy":[168],"iterative":[170],"feedback,":[171],"allowing":[172],"model":[174],"accommodate":[176],"diverse":[177],"user":[181],"preferences.":[182],"Experimental":[183],"results":[184],"confirm":[185],"our":[186],"method\u2019s":[187],"superior":[188],"accuracy":[191],"compared":[192],"FG-SBIR":[195],"systems":[196],"baseline":[198],"models.":[199]},"counts_by_year":[],"updated_date":"2026-01-29T23:17:01.242718","created_date":"2026-01-29T00:00:00"}
