{"id":"https://openalex.org/W7138029764","doi":"https://doi.org/10.48550/arxiv.2603.13951","title":"DCP-CLIP:A Coarse-to-Fine Framework for Open-Vocabulary Semantic Segmentation with Dual Interaction","display_name":"DCP-CLIP:A Coarse-to-Fine Framework for Open-Vocabulary Semantic Segmentation with Dual Interaction","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138029764","doi":"https://doi.org/10.48550/arxiv.2603.13951"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13951","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13951","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13951","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129692306","display_name":"Jing H Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129721439","display_name":"Huimin Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Huimin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129643984","display_name":"Quan Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Quan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129730033","display_name":"Qibo Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Qibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061213204","display_name":"SuoFei ZHANG","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Suofei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129714145","display_name":"Huimin Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Huimin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9667999744415283,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9667999744415283,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.00839999970048666,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0066999997943639755,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7828999757766724},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.7644000053405762},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6186000108718872},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.453900009393692},{"id":"https://openalex.org/keywords/scale-space-segmentation","display_name":"Scale-space segmentation","score":0.4196999967098236},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.397599995136261},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.3882000148296356},{"id":"https://openalex.org/keywords/segmentation-based-object-categorization","display_name":"Segmentation-based object categorization","score":0.37220001220703125}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7892000079154968},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7828999757766724},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.7644000053405762},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6186000108718872},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6018000245094299},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.453900009393692},{"id":"https://openalex.org/C65885262","wikidata":"https://www.wikidata.org/wiki/Q7429708","display_name":"Scale-space segmentation","level":4,"score":0.4196999967098236},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.397599995136261},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.3882000148296356},{"id":"https://openalex.org/C25694479","wikidata":"https://www.wikidata.org/wiki/Q7446278","display_name":"Segmentation-based object categorization","level":5,"score":0.37220001220703125},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.35409998893737793},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3531999886035919},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3472999930381775},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.3409000039100647},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33059999346733093},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.3111000061035156},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3028999865055084},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.29280000925064087},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2727999985218048},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13951","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13951","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13951","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13951","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6896132826805115,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"recent":[1],"years":[2],"have":[3],"witnessed":[4],"the":[5,39,72,112,144,157,174],"remarkable":[6],"development":[7],"for":[8,59,181],"open-vocabulary":[9,103],"semantic":[10,108,138,187],"segmentation":[11,134,150,175],"(OVSS)":[12],"using":[13],"visual-language":[14],"foundation":[15],"models,":[16],"yet":[17],"still":[18],"suffer":[19],"from":[20,38,140,156,173],"following":[21],"fundamental":[22],"challenges:":[23],"(1)":[24],"insufficient":[25],"cross-modal":[26],"communications":[27],"between":[28,90],"textual":[29,83,95,121,127,141],"and":[30,33,71,85,94,147,163,205],"visual":[31,145],"spaces,":[32],"(2)":[34],"significant":[35],"computational":[36],"costs":[37],"interactions":[40,89],"with":[41],"massive":[42],"number":[43],"of":[44,77],"categories.":[45],"To":[46],"address":[47],"these":[48],"issues,":[49],"this":[50],"paper":[51],"describes":[52],"a":[53,132],"novel":[54],"coarse-to-fine":[55],"framework,":[56],"called":[57],"DCP-CLIP,":[58],"OVSS.":[60],"Unlike":[61],"prior":[62],"efforts":[63],"that":[64,195],"mainly":[65],"relied":[66],"on":[67,190],"pre-established":[68],"category":[69,179],"content":[70],"inherent":[73],"spatial-class":[74],"interaction":[75],"capability":[76,105],"CLIP,":[78],"we":[79,99,117,130,169],"dynamic":[80],"constructing":[81],"category-relevant":[82],"features":[84,93,122,155],"explicitly":[86],"models":[87],"dual":[88],"spatial":[91,165,171],"image":[92,113],"class":[96],"semantics.":[97],"Specifically,":[98],"first":[100],"leverage":[101,170],"CLIP's":[102],"recognition":[104],"to":[106,111,123,159,177],"identify":[107],"categories":[109],"relevant":[110],"context,":[114],"upon":[115],"which":[116],"dynamically":[118],"generate":[119],"corresponding":[120],"serve":[124],"as":[125],"initial":[126],"guidance.":[128],"Subsequently,":[129],"conduct":[131],"coarse":[133],"by":[135,151,200],"cross-modally":[136],"integrating":[137,152],"information":[139,172],"guidance":[142],"into":[143],"representations":[146],"achieve":[148],"refined":[149],"spatially":[153],"enriched":[154],"encoder":[158],"recover":[160],"fine-grained":[161],"details":[162],"enhance":[164],"resolution.":[166],"In":[167],"final,":[168],"side":[176],"refine":[178],"predictions":[180],"each":[182],"mask,":[183],"facilitating":[184],"more":[185],"precise":[186],"labeling.":[188],"Experiments":[189],"multiple":[191],"OVSS":[192],"benchmarks":[193],"demonstrate":[194],"DCP-CLIP":[196],"outperforms":[197],"existing":[198],"methods":[199],"delivering":[201],"both":[202],"higher":[203],"accuracy":[204],"greater":[206],"efficiency.":[207]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
