{"id":"https://openalex.org/W4416697877","doi":"https://doi.org/10.1145/3778043","title":"CROMBO: Cross-Modality Bootstrapping for Unified Sketch\u2013Photo Representation Learning","display_name":"CROMBO: Cross-Modality Bootstrapping for Unified Sketch\u2013Photo Representation Learning","publication_year":2025,"publication_date":"2025-11-26","ids":{"openalex":"https://openalex.org/W4416697877","doi":"https://doi.org/10.1145/3778043"},"language":"en","primary_location":{"id":"doi:10.1145/3778043","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3778043","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5106667254","display_name":"Xingyu Liu","orcid":"https://orcid.org/0009-0009-6064-9104"},"institutions":[{"id":"https://openalex.org/I200845125","display_name":"Nanjing University of Information Science and Technology","ror":"https://ror.org/02y0rxk19","country_code":"CN","type":"education","lineage":["https://openalex.org/I200845125"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xingyu Liu","raw_affiliation_strings":["School of Computer Science, Nanjing University of Information Science and Technology, Nanjing, China","School of Computer Science, Nanjing University of Information Science and Technology, China"],"raw_orcid":"https://orcid.org/0009-0009-6064-9104","affiliations":[{"raw_affiliation_string":"School of Computer Science, Nanjing University of Information Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I200845125"]},{"raw_affiliation_string":"School of Computer Science, Nanjing University of Information Science and Technology, China","institution_ids":["https://openalex.org/I200845125"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103525798","display_name":"Yan Jiang","orcid":"https://orcid.org/0009-0002-2031-5627"},"institutions":[{"id":"https://openalex.org/I200845125","display_name":"Nanjing University of Information Science and Technology","ror":"https://ror.org/02y0rxk19","country_code":"CN","type":"education","lineage":["https://openalex.org/I200845125"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Jiang","raw_affiliation_strings":["School of Computer Science, Nanjing University of Information Science and Technology, Nanjing, China","School of Computer Science, Nanjing University of Information Science and Technology, China"],"raw_orcid":"https://orcid.org/0009-0002-2031-5627","affiliations":[{"raw_affiliation_string":"School of Computer Science, Nanjing University of Information Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I200845125"]},{"raw_affiliation_string":"School of Computer Science, Nanjing University of Information Science and Technology, China","institution_ids":["https://openalex.org/I200845125"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101450162","display_name":"Xu Cheng","orcid":"https://orcid.org/0000-0003-2355-9010"},"institutions":[{"id":"https://openalex.org/I200845125","display_name":"Nanjing University of Information Science and Technology","ror":"https://ror.org/02y0rxk19","country_code":"CN","type":"education","lineage":["https://openalex.org/I200845125"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu Cheng","raw_affiliation_strings":["School of Computer Science, Nanjing University of Information Science and Technology, Nanjing, China","School of Computer Science, Nanjing University of Information Science and Technology, China"],"raw_orcid":"https://orcid.org/0000-0003-2355-9010","affiliations":[{"raw_affiliation_string":"School of Computer Science, Nanjing University of Information Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I200845125"]},{"raw_affiliation_string":"School of Computer Science, Nanjing University of Information Science and Technology, China","institution_ids":["https://openalex.org/I200845125"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102331534","display_name":"Hao Yu","orcid":"https://orcid.org/0000-0002-8298-7181"},"institutions":[{"id":"https://openalex.org/I98381234","display_name":"University of Oulu","ror":"https://ror.org/03yj89h83","country_code":"FI","type":"education","lineage":["https://openalex.org/I98381234"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Hao Yu","raw_affiliation_strings":["University of Oulu, Oulu, Finland","Center for Machine Vision and Signal Analysis, University of Oulu, Finland"],"raw_orcid":"https://orcid.org/0000-0002-8298-7181","affiliations":[{"raw_affiliation_string":"University of Oulu, Oulu, Finland","institution_ids":["https://openalex.org/I98381234"]},{"raw_affiliation_string":"Center for Machine Vision and Signal Analysis, University of Oulu, Finland","institution_ids":["https://openalex.org/I98381234"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100763975","display_name":"Haoyu Chen","orcid":"https://orcid.org/0000-0003-3267-2664"},"institutions":[{"id":"https://openalex.org/I98381234","display_name":"University of Oulu","ror":"https://ror.org/03yj89h83","country_code":"FI","type":"education","lineage":["https://openalex.org/I98381234"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Haoyu Chen","raw_affiliation_strings":["University of Oulu, Oulu, Finland","Center for Machine Vision and Signal Analysis, University of Oulu, Finland"],"raw_orcid":"https://orcid.org/0000-0003-3267-2664","affiliations":[{"raw_affiliation_string":"University of Oulu, Oulu, Finland","institution_ids":["https://openalex.org/I98381234"]},{"raw_affiliation_string":"Center for Machine Vision and Signal Analysis, University of Oulu, Finland","institution_ids":["https://openalex.org/I98381234"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5082301986","display_name":"Guoying Zhao","orcid":"https://orcid.org/0000-0003-3694-206X"},"institutions":[{"id":"https://openalex.org/I98381234","display_name":"University of Oulu","ror":"https://ror.org/03yj89h83","country_code":"FI","type":"education","lineage":["https://openalex.org/I98381234"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Guoying Zhao","raw_affiliation_strings":["University of Oulu, Oulu, Finland","Center for Machine Vision and Signal Analysis, University of Oulu, Finland"],"raw_orcid":"https://orcid.org/0000-0003-3694-206X","affiliations":[{"raw_affiliation_string":"University of Oulu, Oulu, Finland","institution_ids":["https://openalex.org/I98381234"]},{"raw_affiliation_string":"Center for Machine Vision and Signal Analysis, University of Oulu, Finland","institution_ids":["https://openalex.org/I98381234"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5106667254"],"corresponding_institution_ids":["https://openalex.org/I200845125"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33220413,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"22","issue":"1","first_page":"1","last_page":"18"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.8108999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.8108999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10057","display_name":"Face and Expression Recognition","score":0.0430000014603138,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.031300000846385956,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sketch","display_name":"Sketch","score":0.8306000232696533},{"id":"https://openalex.org/keywords/bootstrapping","display_name":"Bootstrapping (finance)","score":0.7911999821662903},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.7052000164985657},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.598800003528595},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5979999899864197},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5702000260353088},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5126000046730042},{"id":"https://openalex.org/keywords/sketch-recognition","display_name":"Sketch recognition","score":0.46219998598098755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8569999933242798},{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.8306000232696533},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.7911999821662903},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.7052000164985657},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.598800003528595},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5979999899864197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5856999754905701},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5702000260353088},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5126000046730042},{"id":"https://openalex.org/C132900626","wikidata":"https://www.wikidata.org/wiki/Q7534733","display_name":"Sketch recognition","level":4,"score":0.46219998598098755},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4490000009536743},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4138000011444092},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.383899986743927},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.36489999294281006},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3515999913215637},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.34209999442100525},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3296000063419342},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.30820000171661377},{"id":"https://openalex.org/C101814296","wikidata":"https://www.wikidata.org/wiki/Q5439685","display_name":"Feature model","level":3,"score":0.28540000319480896},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.27810001373291016},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3778043","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3778043","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3590615737","display_name":null,"funder_award_id":"62572249","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2128543433","https://openalex.org/W2466618734","https://openalex.org/W2685669664","https://openalex.org/W2746642387","https://openalex.org/W2789433008","https://openalex.org/W2801271986","https://openalex.org/W2808287223","https://openalex.org/W2891833067","https://openalex.org/W2903798738","https://openalex.org/W2976288813","https://openalex.org/W3001208314","https://openalex.org/W3024646777","https://openalex.org/W3085694362","https://openalex.org/W3163101577","https://openalex.org/W3196775275","https://openalex.org/W4285254934","https://openalex.org/W4296914379","https://openalex.org/W4312753547","https://openalex.org/W4312802153","https://openalex.org/W4403319676"],"related_works":[],"abstract_inverted_index":{"Sketch\u2013photo":[0],"recognition":[1],"refers":[2],"to":[3,36,88,106,126,156,182],"matching":[4],"hand-drawn":[5],"sketches":[6,140],"with":[7],"their":[8],"corresponding":[9],"photos,":[10],"where":[11],"the":[12,19,22,28,38,42,45,50,57,65,80,86,103,128,132,137,158,162,172,185,204,210,225],"performance":[13,227],"essentially":[14],"depends":[15],"on":[16,220],"how":[17],"well":[18],"representations":[20],"of":[21,76,139,188,206,228],"two":[23,189],"modalities":[24,190],"are":[25],"aligned":[26],"in":[27,131,161,191,209,216],"feature":[29,53,164],"spaces.":[30],"Existing":[31],"works":[32],"bluntly":[33],"force":[34],"models":[35],"reduce":[37],"representation":[39,109,160],"discrepancy":[40,87,105],"between":[41],"modalities,":[43],"making":[44,213],"learning":[46,98,110],"less":[47],"effective.":[48],"Besides,":[49],"current":[51],"symmetric":[52],"extraction":[54,165],"framework":[55,99],"prefers":[56],"photo":[58,133,173],"modality":[59,81,104,134],"for":[60,231],"richer":[61],"information":[62,170],"while":[63,212],"neglecting":[64],"sketch":[66,159,236],"modality.":[67,174],"Driven":[68],"by":[69,135,167,202],"these":[70],"observations,":[71],"we":[72,83,93,117,176],"argue":[73],"that,":[74],"instead":[75],"forcefully":[77],"wiping":[78],"out":[79],"discrepancy,":[82],"may":[84],"utilize":[85],"enhance":[89],"model":[90],"learning.":[91],"Thus,":[92],"propose":[94],"a":[95,112,120,148,178,192],"Cross-Modality":[96],"Bootstrapping":[97,123],"(CROMBO)":[100],"that":[101],"utilizes":[102],"bootstrap":[107],"cross-modality":[108],"via":[111],"differentiated":[113],"interaction":[114],"manner.":[115],"Specifically,":[116],"first":[118],"present":[119],"Sketch":[121,150],"Implicit":[122],"(SIB)":[124],"module":[125,153],"magnify":[127],"recognizable":[129],"elements":[130],"utilizing":[136],"characteristic":[138],"having":[141],"only":[142],"contours":[143],"and":[144,242],"key":[145],"details.":[146],"Second,":[147],"Photo-driven":[149],"Refinement":[151],"(PSR)":[152],"is":[154],"developed":[155],"guide":[157],"shared":[163],"process":[166],"supplementing":[168],"rich":[169],"from":[171],"Moreover,":[175],"design":[177],"second-order":[179],"alignment":[180],"strategy":[181],"dynamically":[183],"align":[184],"latent":[186],"distribution":[187],"Hilbert":[193],"space.":[194],"Also,":[195],"our":[196,229],"CROMBO":[197,230],"can":[198],"learn":[199],"fewer":[200],"parameters":[201],"freezing":[203],"weights":[205],"shallow":[207],"layers":[208],"backbone":[211],"no":[214],"sacrifice":[215],"performance.":[217],"Extensive":[218],"experiments":[219],"six":[221],"public":[222],"datasets":[223],"verify":[224],"superior":[226],"sketch\u2013photo-based":[232],"tasks,":[233],"such":[234],"as":[235],"re-identification":[237],"(Re-ID),":[238],"sketch\u2013photo":[239],"face":[240],"recognition,":[241],"sketch-based":[243],"image":[244],"retrieval.":[245]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-11-27T00:00:00"}
