{"id":"https://openalex.org/W4392824057","doi":"https://doi.org/10.1145/3638584.3638624","title":"Image Segmentation with Vision-Language Models","display_name":"Image Segmentation with Vision-Language Models","publication_year":2023,"publication_date":"2023-12-08","ids":{"openalex":"https://openalex.org/W4392824057","doi":"https://doi.org/10.1145/3638584.3638624"},"language":"en","primary_location":{"id":"doi:10.1145/3638584.3638624","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3638584.3638624","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3638584.3638624","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 7th International Conference on Computer Science and Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3638584.3638624","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089770514","display_name":"Lihu Pan","orcid":"https://orcid.org/0000-0001-9530-694X"},"institutions":[{"id":"https://openalex.org/I46305995","display_name":"Taiyuan University of Science and Technology","ror":"https://ror.org/01wcbdc92","country_code":"CN","type":"education","lineage":["https://openalex.org/I46305995"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lihu Pan","raw_affiliation_strings":["Taiyuan University of Science and Technology, China"],"raw_orcid":"https://orcid.org/0000-0001-9530-694X","affiliations":[{"raw_affiliation_string":"Taiyuan University of Science and Technology, China","institution_ids":["https://openalex.org/I46305995"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062109496","display_name":"Yunting Yang","orcid":"https://orcid.org/0009-0007-2195-622X"},"institutions":[{"id":"https://openalex.org/I46305995","display_name":"Taiyuan University of Science and Technology","ror":"https://ror.org/01wcbdc92","country_code":"CN","type":"education","lineage":["https://openalex.org/I46305995"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunting Yang","raw_affiliation_strings":["Taiyuan University of Science and Technology, China"],"raw_orcid":"https://orcid.org/0009-0007-2195-622X","affiliations":[{"raw_affiliation_string":"Taiyuan University of Science and Technology, China","institution_ids":["https://openalex.org/I46305995"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020485549","display_name":"Zhengkui Wang","orcid":"https://orcid.org/0000-0003-4554-0791"},"institutions":[{"id":"https://openalex.org/I168639165","display_name":"Singapore Institute of Technology","ror":"https://ror.org/01v2c2791","country_code":"SG","type":"education","lineage":["https://openalex.org/I168639165"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Zhengkui Wang","raw_affiliation_strings":["InfoComm Technology Cluster, Singapore Institute of Technology, Singapore"],"raw_orcid":"https://orcid.org/0000-0003-4554-0791","affiliations":[{"raw_affiliation_string":"InfoComm Technology Cluster, Singapore Institute of Technology, Singapore","institution_ids":["https://openalex.org/I168639165"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064510205","display_name":"Rui Zhang","orcid":"https://orcid.org/0000-0001-7767-7413"},"institutions":[{"id":"https://openalex.org/I46305995","display_name":"Taiyuan University of Science and Technology","ror":"https://ror.org/01wcbdc92","country_code":"CN","type":"education","lineage":["https://openalex.org/I46305995"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Zhang","raw_affiliation_strings":["Taiyuan University of Science and Technology, China"],"raw_orcid":"https://orcid.org/0000-0001-7767-7413","affiliations":[{"raw_affiliation_string":"Taiyuan University of Science and Technology, China","institution_ids":["https://openalex.org/I46305995"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101576537","display_name":"Wen Shan","orcid":"https://orcid.org/0000-0002-7377-8943"},"institutions":[{"id":"https://openalex.org/I8696757","display_name":"Singapore University of Social Sciences","ror":"https://ror.org/01s57k749","country_code":"SG","type":"education","lineage":["https://openalex.org/I8696757"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Wen Shan","raw_affiliation_strings":["Singapore University of Social Sciences, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-7377-8943","affiliations":[{"raw_affiliation_string":"Singapore University of Social Sciences, Singapore","institution_ids":["https://openalex.org/I8696757"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070359684","display_name":"Jiashu Li","orcid":"https://orcid.org/0009-0009-6686-3857"},"institutions":[{"id":"https://openalex.org/I46305995","display_name":"Taiyuan University of Science and Technology","ror":"https://ror.org/01wcbdc92","country_code":"CN","type":"education","lineage":["https://openalex.org/I46305995"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiashu Li","raw_affiliation_strings":["Taiyuan University of Science and Technology, China"],"raw_orcid":"https://orcid.org/0009-0009-6686-3857","affiliations":[{"raw_affiliation_string":"Taiyuan University of Science and Technology, China","institution_ids":["https://openalex.org/I46305995"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5089770514"],"corresponding_institution_ids":["https://openalex.org/I46305995"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.20389705,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"233","last_page":"238"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.730174720287323},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6813842058181763},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.65565025806427},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.6369544863700867},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4724726378917694},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4433751702308655}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.730174720287323},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6813842058181763},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.65565025806427},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.6369544863700867},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4724726378917694},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4433751702308655}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3638584.3638624","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3638584.3638624","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3638584.3638624","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 7th International Conference on Computer Science and Artificial Intelligence","raw_type":"proceedings-article"},{"id":"pmh:oai:figshare.com:article/28182329","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4377196282","display_name":"Figshare","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210132348","host_organization_name":"Figshare (United Kingdom)","host_organization_lineage":["https://openalex.org/I4210132348"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference contribution"}],"best_oa_location":{"id":"doi:10.1145/3638584.3638624","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3638584.3638624","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3638584.3638624","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 7th International Conference on Computer Science and Artificial Intelligence","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4392824057.pdf","grobid_xml":"https://content.openalex.org/works/W4392824057.grobid-xml"},"referenced_works_count":25,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2125215748","https://openalex.org/W2331011752","https://openalex.org/W2879390606","https://openalex.org/W2924485953","https://openalex.org/W2983850069","https://openalex.org/W3047258141","https://openalex.org/W3049293589","https://openalex.org/W3104749918","https://openalex.org/W3106029750","https://openalex.org/W3106906018","https://openalex.org/W3159619744","https://openalex.org/W3167453437","https://openalex.org/W3202427362","https://openalex.org/W4230056077","https://openalex.org/W4236345650","https://openalex.org/W4287775237","https://openalex.org/W4294770622","https://openalex.org/W4296181602","https://openalex.org/W4312420092","https://openalex.org/W4312795220","https://openalex.org/W4313203522","https://openalex.org/W4313547380","https://openalex.org/W4315607807","https://openalex.org/W4385380722"],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Image":[0],"segmentation":[1,29,45,61,66,109,144],"traditionally":[2],"relies":[3],"on":[4,25,32,72,82],"predefined":[5],"object":[6],"classes,":[7],"which":[8],"can":[9],"pose":[10],"challenges":[11],"when":[12],"accommodating":[13],"new":[14],"categories":[15],"or":[16,76],"complex":[17],"queries,":[18],"often":[19],"necessitating":[20],"model":[21,62,80,120],"retraining.":[22],"Relying":[23],"solely":[24],"visual":[26,92],"information":[27],"for":[28,68,134],"heavily":[30],"depends":[31],"annotated":[33],"samples,":[34],"and":[35,115,124,150],"as":[36],"the":[37,43,95,98],"number":[38],"of":[39,97,105,143],"unknown":[40],"classes":[41],"increases,":[42],"model\u2019s":[44],"performance":[46],"experiences":[47],"significant":[48],"declines.":[49],"To":[50],"address":[51],"these":[52],"challenges,":[53],"this":[54],"paper":[55],"introduces":[56],"ViLaSeg,":[57],"an":[58],"innovative":[59],"image":[60,123],"that":[63],"generates":[64],"binary":[65],"maps":[67],"query":[69],"images":[70],"based":[71],"either":[73],"free-text":[74],"prompts":[75,84,93],"support":[77],"images.":[78],"Our":[79],"capitalizes":[81],"text":[83,125],"to":[85,101],"establish":[86],"comprehensive":[87],"contextual":[88],"logical":[89],"relationships,":[90],"while":[91],"harness":[94],"power":[96],"GroupViT":[99],"encoder":[100],"capture":[102],"local":[103],"features":[104],"multiple":[106],"objects,":[107],"enhancing":[108],"precision.":[110],"By":[111],"employing":[112],"selective":[113],"attention":[114],"facilitating":[116],"cross-modal":[117],"interactions,":[118],"our":[119],"seamlessly":[121],"fuses":[122],"features,":[126],"further":[127],"refined":[128],"by":[129],"a":[130,141],"transformer-based":[131],"decoder":[132],"designed":[133],"dense":[135],"prediction":[136],"tasks.":[137],"ViLaSeg":[138],"excels":[139],"across":[140],"spectrum":[142],"tasks,":[145],"including":[146],"referring":[147],"expression,":[148],"zero-shot,":[149],"one-shot":[151],"segmentation,":[152],"surpassing":[153],"prior":[154],"state-of-the-art":[155],"approaches.":[156]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
