{"id":"https://openalex.org/W4415540067","doi":"https://doi.org/10.1145/3746027.3755212","title":"From Language to Instance: Generative Visual Prompting for Zero-shot Camouflaged Object Detection","display_name":"From Language to Instance: Generative Visual Prompting for Zero-shot Camouflaged Object Detection","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540067","doi":"https://doi.org/10.1145/3746027.3755212"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755212","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040834836","display_name":"Zihou Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zihou Zhang","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100348559","display_name":"Hao Li","orcid":"https://orcid.org/0000-0002-1758-5936"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Li","raw_affiliation_strings":["National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024393567","display_name":"Zhengwei Yang","orcid":"https://orcid.org/0000-0002-8190-1438"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhengwei Yang","raw_affiliation_strings":["National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090949082","display_name":"Zechao Hu","orcid":"https://orcid.org/0000-0001-6834-6137"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zechao Hu","raw_affiliation_strings":["National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100420654","display_name":"Liang Li","orcid":"https://orcid.org/0000-0002-1943-8219"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Li","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100401164","display_name":"Zheng Wang","orcid":"https://orcid.org/0000-0003-3846-9157"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Wang","raw_affiliation_strings":["National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center for Multimedia Software, School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5040834836"],"corresponding_institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32566372,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"382","last_page":"391"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5792999863624573},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5390999913215637},{"id":"https://openalex.org/keywords/camouflage","display_name":"Camouflage","score":0.5324000120162964},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5307000279426575},{"id":"https://openalex.org/keywords/hallucinating","display_name":"Hallucinating","score":0.5091999769210815},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.5024999976158142},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.48179998993873596},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.45210000872612},{"id":"https://openalex.org/keywords/visual-objects","display_name":"Visual Objects","score":0.44519999623298645},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.43209999799728394}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7840999960899353},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6898999810218811},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5792999863624573},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5390999913215637},{"id":"https://openalex.org/C2776196576","wikidata":"https://www.wikidata.org/wiki/Q196113","display_name":"Camouflage","level":2,"score":0.5324000120162964},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5307000279426575},{"id":"https://openalex.org/C2911011789","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Hallucinating","level":2,"score":0.5091999769210815},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.5024999976158142},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.48179998993873596},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.45210000872612},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.44519999623298645},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.43209999799728394},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4174000024795532},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4174000024795532},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.4009000062942505},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.39239999651908875},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.38769999146461487},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.37130001187324524},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.3695000112056732},{"id":"https://openalex.org/C17305859","wikidata":"https://www.wikidata.org/wiki/Q382944","display_name":"Soar","level":2,"score":0.3562999963760376},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.35409998893737793},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.33309999108314514},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3269999921321869},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.32679998874664307},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.326200008392334},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3149000108242035},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.31279999017715454},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.31029999256134033},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.27090001106262207}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755212","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W3034453930","https://openalex.org/W3164098653","https://openalex.org/W3176152216","https://openalex.org/W3177087374","https://openalex.org/W3210073375","https://openalex.org/W4312258849","https://openalex.org/W4312420092","https://openalex.org/W4312933868","https://openalex.org/W4323897324","https://openalex.org/W4382465636","https://openalex.org/W4386075673","https://openalex.org/W4390874575","https://openalex.org/W4391791458","https://openalex.org/W4392172931","https://openalex.org/W4393159738","https://openalex.org/W4400677507","https://openalex.org/W4400905856","https://openalex.org/W4401770219","https://openalex.org/W4402670231","https://openalex.org/W4402915908","https://openalex.org/W4403791158","https://openalex.org/W4403791303"],"related_works":[],"abstract_inverted_index":{"Traditional":[0],"Camouflaged":[1],"Object":[2],"Detection":[3],"(COD)":[4],"methods":[5],"heavily":[6],"depend":[7],"on":[8,166],"labor-intensive":[9],"annotated":[10],"datasets":[11,169],"which":[12],"require":[13],"extensive":[14],"manual":[15],"effort,":[16],"resulting":[17],"in":[18,55,161],"limited":[19],"generalization.":[20],"While":[21],"recent":[22],"studies":[23],"have":[24],"combined":[25],"Multimodal":[26],"Large":[27],"Language":[28],"Models":[29,34],"(MLLMs)":[30],"and":[31,50,173],"Vision":[32],"Foundation":[33],"(VFMs)":[35],"to":[36,107,126,156],"achieve":[37],"zero-shot":[38],"COD,":[39],"their":[40],"performance":[41],"is":[42,147],"hindered":[43],"by":[44,76,81,133],"modality":[45,114],"gap":[46],"between":[47],"linguistic":[48],"semantics":[49],"fine-grained":[51],"visual":[52,66,85,109,159],"cues,":[53],"especially":[54],"complex":[56],"camouflage":[57],"scenarios.":[58],"In":[59],"this":[60,74],"paper,":[61],"we":[62,93,119],"propose":[63],"Language-to-instance":[64],"generative":[65,90,154],"Prompting":[67],"(LiP),":[68],"a":[69,88,95],"novel":[70],"framework":[71,150],"that":[72,102,151],"addresses":[73],"limitation":[75],"transforming":[77],"text":[78],"prompts":[79,86,132,160],"generated":[80],"MLLMs":[82],"into":[83],"instance-level":[84,158],"through":[87],"text-to-image":[89,153],"process.":[91],"Specifically,":[92],"introduce":[94,120],"Diffusion-driven":[96],"Visual":[97],"Prompt":[98],"Generation":[99],"(DVPG)":[100],"module":[101,125],"leverages":[103],"Stable":[104],"Diffusion":[105],"model":[106,155],"synthesize":[108],"references,":[110],"enabling":[111],"robust":[112],"homogeneous":[113],"matching":[115],"for":[116],"COD.":[117],"Additionally,":[118],"Instruction":[121],"Contrastive":[122],"Reasoning":[123],"(ICR)":[124],"enhance":[127],"the":[128,141,148,171],"semantic":[129],"reliability":[130],"of":[131,143,177],"suppressing":[134],"hallucinated":[135],"concepts":[136],"during":[137],"MLLM":[138],"inference.":[139],"To":[140],"best":[142],"our":[144,178],"knowledge,":[145],"LiP":[146],"first":[149],"utilize":[152],"construct":[157],"COD":[162],"task.":[163],"Extensive":[164],"experiments":[165],"four":[167],"benchmark":[168],"demonstrate":[170],"effectiveness":[172],"strong":[174],"generalization":[175],"ability":[176],"approach.":[179]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
