{"id":"https://openalex.org/W7134964815","doi":"https://doi.org/10.48550/arxiv.2603.08898","title":"Towards Visual Query Segmentation in the Wild","display_name":"Towards Visual Query Segmentation in the Wild","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7134964815","doi":"https://doi.org/10.48550/arxiv.2603.08898"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.08898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.08898","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068953861","display_name":"Bing Fan","orcid":"https://orcid.org/0000-0003-4439-6150"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Bing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128767671","display_name":"Minghao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Minghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128690418","display_name":"Hanzhi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Hanzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100630430","display_name":"Shaohua Dong","orcid":"https://orcid.org/0009-0008-2685-6217"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Shaohua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128789997","display_name":"Naga Prudhvi Mareedu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mareedu, Naga Prudhvi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128757039","display_name":"Weishi Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Weishi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128728274","display_name":"Yunhe Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Yunhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128773426","display_name":"Yan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128725615","display_name":"Heng Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Heng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.38690000772476196,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.38690000772476196,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.28189998865127563,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.14259999990463257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7006000280380249},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6168000102043152},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5860000252723694},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5565999746322632},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5519000291824341},{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.5501999855041504},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.5184999704360962},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.4309999942779541}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.838699996471405},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7006000280380249},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6687999963760376},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6168000102043152},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5860000252723694},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5565999746322632},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5519000291824341},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.5501999855041504},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.541700005531311},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.5184999704360962},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.4309999942779541},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.391400009393692},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3702000081539154},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.3260999917984009},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.303600013256073},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3003000020980835},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.26600000262260437},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.26409998536109924},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.08898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.08898","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08898","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0,222],"this":[1,78],"paper,":[2],"we":[3,80,176,246],"introduce":[4],"visual":[5,13,36,115],"query":[6,14,116],"segmentation":[7],"(VQS),":[8],"a":[9,48,83,102,114,119,178,205],"new":[10],"paradigm":[11,254],"of":[12,24,27,47,105,159],"localization":[15],"(VQL)":[16],"that":[17],"aims":[18],"to":[19,39,87,135,172,199,248],"segment":[20],"all":[21,58,143,234],"pixel-level":[22,64],"occurrences":[23],"an":[25,30,34,210],"object":[26,59,107],"interest":[28],"in":[29,145],"untrimmed":[31],"video,":[32],"given":[33],"external":[35],"query.":[37],"Compared":[38],"existing":[40,235],"VQL":[41,253],"locating":[42],"only":[43],"the":[44,122,136,157,164,197,202,220,241,251],"last":[45],"appearance":[46],"target":[49,127],"using":[50],"bounding":[51],"boxes,":[52],"VQS":[53],"enables":[54],"more":[55,69,95,257],"comprehensive":[56],"(i.e.,":[57,63],"occurrences)":[60],"and":[61,100,125,129,153,192,232,244,255,260,268],"precise":[62],"masks)":[65],"localization,":[66],"making":[67],"it":[68],"practical":[70,261],"for":[71,169,216],"real-world":[72],"scenarios.":[73],"To":[74,139,156],"foster":[75],"research":[76,259],"on":[77,226,263],"task,":[79],"present":[81,177],"VQS-4K,":[82,227],"large-scale":[84],"benchmark":[85,166],"dedicated":[86],"VQS.":[88,170,264],"Specifically,":[89],"VQS-4K":[90,146,162,243],"contains":[91],"4,111":[92],"videos":[93,144],"with":[94,113,131,150,209],"than":[96],"1.3":[97],"million":[98],"frames":[99],"covers":[101],"diverse":[103],"set":[104],"222":[106],"categories.":[108],"Each":[109],"video":[110,124,198],"is":[111,163],"paired":[112],"defined":[117],"by":[118,189],"frame":[120],"outside":[121],"search":[123],"its":[126,238],"mask,":[128],"annotated":[130],"spatial-temporal":[132],"masklets":[133],"corresponding":[134],"queried":[137],"target.":[138],"ensure":[140],"high":[141],"quality,":[142],"are":[147],"manually":[148],"labeled":[149],"meticulous":[151],"inspection":[152],"iterative":[154],"refinement.":[155],"best":[158],"our":[160,223],"knowledge,":[161],"first":[165],"specifically":[167],"designed":[168],"Furthermore,":[171],"stimulate":[173],"future":[174,258],"research,":[175],"simple":[179],"yet":[180],"effective":[181],"method,":[182],"named":[183],"VQ-SAM,":[184,245],"which":[185],"extends":[186],"SAM":[187],"2":[188],"leveraging":[190],"target-specific":[191],"background":[193],"distractor":[194],"cues":[195],"from":[196],"progressively":[200],"evolve":[201],"memory":[203,212],"through":[204],"novel":[206],"multi-stage":[207],"framework":[208],"adaptive":[211],"generation":[213],"(AMG)":[214],"module":[215],"VQS,":[217],"significantly":[218],"improving":[219],"performance.":[221],"extensive":[224],"experiments":[225],"VQ-SAM":[228],"achieves":[229],"promising":[230],"results":[231,269],"surpasses":[233],"approaches,":[236],"demonstrating":[237],"effectiveness.":[239],"With":[240],"proposed":[242],"expect":[247],"go":[249],"beyond":[250],"current":[252],"inspire":[256],"applications":[262],"Our":[265],"benchmark,":[266],"code,":[267],"will":[270],"be":[271],"made":[272],"publicly":[273],"available.":[274]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-12T00:00:00"}
