{"id":"https://openalex.org/W7161759659","doi":"https://doi.org/10.48550/arxiv.2605.20110","title":"SetCon: Towards Open-Ended Referring Segmentation via Set-Level Concept Prediction","display_name":"SetCon: Towards Open-Ended Referring Segmentation via Set-Level Concept Prediction","publication_year":2026,"publication_date":"2026-05-19","ids":{"openalex":"https://openalex.org/W7161759659","doi":"https://doi.org/10.48550/arxiv.2605.20110"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.20110","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20110","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.20110","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136591612","display_name":"Zhixiong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhixiong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136556083","display_name":"Yizhuo Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yizhuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013238521","display_name":"Shuangrui Ding","orcid":"https://orcid.org/0000-0001-7033-774X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Shuangrui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136505497","display_name":"Yuhang Zang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zang, Yuhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136533532","display_name":"Shengyuan Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Shengyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136577460","display_name":"Long Xing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Long","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136608076","display_name":"Yibin Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136549239","display_name":"Qiaosheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qiaosheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136569491","display_name":"Jiaqi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jiaqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9140999913215637,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9140999913215637,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.013299999758601189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.01269999984651804,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.7125999927520752},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5109999775886536},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.4250999987125397},{"id":"https://openalex.org/keywords/scope","display_name":"Scope (computer science)","score":0.4187999963760376},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.41510000824928284},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.41440001130104065},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.34880000352859497}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7734000086784363},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.7125999927520752},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5924000144004822},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5109999775886536},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.4250999987125397},{"id":"https://openalex.org/C2778012447","wikidata":"https://www.wikidata.org/wiki/Q1034415","display_name":"Scope (computer science)","level":2,"score":0.4187999963760376},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.41510000824928284},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.41440001130104065},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3522000014781952},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.34880000352859497},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.32499998807907104},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30559998750686646},{"id":"https://openalex.org/C65885262","wikidata":"https://www.wikidata.org/wiki/Q7429708","display_name":"Scale-space segmentation","level":4,"score":0.28439998626708984},{"id":"https://openalex.org/C17231256","wikidata":"https://www.wikidata.org/wiki/Q5156540","display_name":"Completeness (order theory)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25699999928474426},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.2513999938964844},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.25040000677108765},{"id":"https://openalex.org/C2778180026","wikidata":"https://www.wikidata.org/wiki/Q18378163","display_name":"Semantic heterogeneity","level":4,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.20110","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20110","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.20110","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20110","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Referring":[0],"segmentation":[1,71,134],"grounds":[2],"natural-language":[3,85],"queries":[4],"to":[5,11,57,176],"pixel-level":[6],"masks,":[7],"but":[8],"extending":[9],"it":[10,115],"complex":[12],"scenarios":[13],"with":[14,35,121,136,160],"multiple":[15,43],"instances,":[16],"cross-category":[17],"groups,":[18],"or":[19,37],"open-ended":[20,69],"target":[21,110,122],"sets":[22],"remains":[23],"challenging.":[24],"Previous":[25],"Large":[26],"Vision":[27],"Language":[28],"Model":[29],"(LVLM)-based":[30],"methods":[31],"represent":[32],"referred":[33,168],"targets":[34,44,169],"one":[36],"more":[38],"special":[39],"tokens":[40],"sequentially,":[41],"treating":[42],"as":[45,62,72,91,164],"separate":[46],"outputs":[47],"rather":[48],"than":[49],"a":[50,104,127,179],"coherent":[51],"set":[52],"and":[53,64,77,112,196],"offering":[54],"little":[55],"incentive":[56],"capture":[58],"set-level":[59,74,106],"properties":[60],"such":[61],"completeness":[63],"mutual":[65],"exclusivity.":[66],"We":[67],"reformulate":[68],"referring":[70,188],"explicit":[73],"concept":[75,107,118,143,172],"prediction":[76],"propose":[78],"Set-Concept":[79],"Segmentation":[80],"(SetCon),":[81],"which":[82],"uses":[83],"LVLM-generated":[84],"concepts,":[86],"instead":[87],"of":[88,167],"segmentation-specific":[89],"tokens,":[90],"semantic":[92,100,138],"conditions":[93],"for":[94],"joint":[95],"mask-set":[96],"decoding.":[97],"A":[98],"hierarchical":[99,137],"decomposition":[101],"first":[102],"predicts":[103],"shared":[105],"defining":[108],"the":[109,165],"scope":[111],"then":[113],"refines":[114],"into":[116],"fine-grained":[117],"groups":[119],"aligned":[120],"subsets.":[123],"To":[124],"support":[125],"this,":[126],"two-stage":[128],"annotation":[129],"pipeline":[130],"augments":[131],"existing":[132],"reasoning":[133],"datasets":[135],"supervision":[139],"(236k":[140],"samples,":[141],"784k":[142],"phrases).":[144],"SetCon":[145],"achieves":[146],"state-of-the-art":[147,184],"results":[148,185],"on":[149,154,158,186,194,199],"image":[150],"benchmarks":[151],"(+3.3":[152],"gIoU":[153,157],"gRefCOCO,":[155],"+12.1":[156],"MUSE),":[159],"margins":[161],"that":[162],"grow":[163],"number":[166],"increases.":[170],"The":[171],"interface":[173],"also":[174],"transfers":[175],"video":[177,189],"under":[178],"detect-and-track":[180],"setting,":[181],"yielding":[182],"new":[183],"seven":[187],"benchmarks,":[190],"including":[191],"+10.9":[192],"J&amp;F":[193,198],"MeViS":[195],"+12.4":[197],"Ref-SeCVOS.":[200]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-21T00:00:00"}
