{"id":"https://openalex.org/W4417537686","doi":"https://doi.org/10.1109/iccv51701.2025.02104","title":"Stepping Out of Similar Semantic Space for Open-Vocabulary Segmentation","display_name":"Stepping Out of Similar Semantic Space for Open-Vocabulary Segmentation","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4417537686","doi":"https://doi.org/10.1109/iccv51701.2025.02104"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.02104","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.02104","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.16058","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yong Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yong Liu","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School","institution_ids":["https://openalex.org/I4210114105"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Song-Li Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Song-Li Wu","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School","institution_ids":["https://openalex.org/I4210114105"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Sule Bai","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sule Bai","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School","institution_ids":["https://openalex.org/I4210114105"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiahao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jiahao Wang","raw_affiliation_strings":["The University of Hong Kong"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The University of Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yitong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yitong Wang","raw_affiliation_strings":["ByteDance Inc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"ByteDance Inc","institution_ids":[]}]},{"author_position":"last","author":{"id":null,"display_name":"Yansong Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yansong Tang","raw_affiliation_strings":["Tsinghua Shenzhen International Graduate School"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua Shenzhen International Graduate School","institution_ids":["https://openalex.org/I4210114105"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I4210114105"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.39697548,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"22664","last_page":"22675"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5907999873161316,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5907999873161316,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.09839999675750732,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.08479999750852585,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.8233000040054321},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7843999862670898},{"id":"https://openalex.org/keywords/soundness","display_name":"Soundness","score":0.6887999773025513},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.6682999730110168},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.49799999594688416},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.4560000002384186}],"concepts":[{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.8233000040054321},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7843999862670898},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.761900007724762},{"id":"https://openalex.org/C39920170","wikidata":"https://www.wikidata.org/wiki/Q693083","display_name":"Soundness","level":2,"score":0.6887999773025513},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6682999730110168},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5782999992370605},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.49799999594688416},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.49320000410079956},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.4560000002384186},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.3440000116825104},{"id":"https://openalex.org/C65885262","wikidata":"https://www.wikidata.org/wiki/Q7429708","display_name":"Scale-space segmentation","level":4,"score":0.3276999890804291},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.31200000643730164},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29440000653266907},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27639999985694885},{"id":"https://openalex.org/C25694479","wikidata":"https://www.wikidata.org/wiki/Q7446278","display_name":"Segmentation-based object categorization","level":5,"score":0.25760000944137573},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.25529998540878296}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.02104","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.02104","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2506.16058","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.16058","pdf_url":"https://arxiv.org/pdf/2506.16058","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.16058","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.16058","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.16058","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.16058","pdf_url":"https://arxiv.org/pdf/2506.16058","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Open-vocabulary":[0],"segmentation":[1,5,142],"aims":[2],"to":[3,27,95,101,139],"achieve":[4,16],"of":[6,31,58,108,152,158,181],"arbitrary":[7],"categories":[8],"given":[9],"unlimited":[10],"text":[11],"inputs":[12],"as":[13,61],"guidance.":[14],"To":[15,75],"this,":[17],"recent":[18],"works":[19],"have":[20,37],"focused":[21],"on":[22,41,115,127,166],"developing":[23],"various":[24],"technical":[25],"routes":[26],"exploit":[28],"the":[29,55,67,89,98,124,141,159,177],"potential":[30],"large-scale":[32],"pre-trained":[33],"vision-language":[34],"models":[35],"and":[36,103,146,155,170,179,185],"made":[38],"significant":[39],"progress":[40],"existing":[42,48,113,128,168],"benchmarks.":[43],"However,":[44],"we":[45,78,117,133],"find":[46,118],"that":[47,85,119],"test":[49,129],"sets":[50],"are":[51],"limited":[52],"in":[53],"measuring":[54],"models'":[56],"comprehension":[57],"``open-vocabulary\"":[59],"concepts,":[60],"their":[62,120],"semantic":[63],"space":[64],"closely":[65],"resembles":[66],"training":[68,90,160],"space,":[69,161],"even":[70],"with":[71],"many":[72],"overlapping":[73],"categories.":[74],"this":[76],"end,":[77],"present":[79],"a":[80,105,135],"new":[81],"benchmark":[82,184],"named":[83,137],"OpenBench":[84],"differs":[86],"significantly":[87],"from":[88,123],"semantics.":[91],"It":[92],"is":[93],"designed":[94],"better":[96],"assess":[97],"model's":[99],"ability":[100],"understand":[102],"segment":[104],"wide":[106],"range":[107],"real-world":[109],"concepts.":[110],"When":[111],"testing":[112],"methods":[114],"OpenBench,":[116],"performance":[121,143],"diverges":[122],"conclusions":[125],"drawn":[126],"sets.":[130],"In":[131],"addition,":[132],"propose":[134],"method":[136],"OVSNet":[138,162],"improve":[140],"for":[144],"diverse":[145],"open":[147],"scenarios.":[148],"Through":[149],"elaborate":[150],"fusion":[151],"heterogeneous":[153],"features":[154],"cost-free":[156],"expansion":[157],"achieves":[163],"state-of-the-art":[164],"results":[165],"both":[167],"datasets":[169],"our":[171,182],"proposed":[172,183],"OpenBench.":[173],"Corresponding":[174],"analysis":[175],"demonstrate":[176],"soundness":[178],"effectiveness":[180],"method.":[186]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
