{"id":"https://openalex.org/W4385688009","doi":"https://doi.org/10.1145/3605801.3605832","title":"Semantic Queries with Transformer for Referring Image Segmentation","display_name":"Semantic Queries with Transformer for Referring Image Segmentation","publication_year":2023,"publication_date":"2023-06-16","ids":{"openalex":"https://openalex.org/W4385688009","doi":"https://doi.org/10.1145/3605801.3605832"},"language":"en","primary_location":{"id":"doi:10.1145/3605801.3605832","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3605801.3605832","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 2nd International Conference on Networks, Communications and Information Technology","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103201992","display_name":"Yukun Zhai","orcid":"https://orcid.org/0000-0002-2312-8651"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yukun Zhai","raw_affiliation_strings":["Beijing Institute of Technology, China"],"raw_orcid":"https://orcid.org/0000-0002-2312-8651","affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, China","institution_ids":["https://openalex.org/I125839683"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5103201992"],"corresponding_institution_ids":["https://openalex.org/I125839683"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08420558,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"161","last_page":"167"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8594626188278198},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5904024243354797},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5716946721076965},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5683886408805847},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.566489577293396},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5025081634521484},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.416143536567688},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.37805742025375366},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.08596992492675781}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8594626188278198},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5904024243354797},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5716946721076965},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5683886408805847},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.566489577293396},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5025081634521484},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.416143536567688},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.37805742025375366},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08596992492675781},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3605801.3605832","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3605801.3605832","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2023 2nd International Conference on Networks, Communications and Information Technology","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6499999761581421,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1903029394","https://openalex.org/W2489434015","https://openalex.org/W2565639579","https://openalex.org/W2605127024","https://openalex.org/W2798556392","https://openalex.org/W2894964039","https://openalex.org/W2963109634","https://openalex.org/W2963145877","https://openalex.org/W2964345792","https://openalex.org/W2964935470","https://openalex.org/W2980088508","https://openalex.org/W2993182889","https://openalex.org/W3034325957","https://openalex.org/W3034692043","https://openalex.org/W3035097537","https://openalex.org/W3093025045","https://openalex.org/W3138516171","https://openalex.org/W3169998662","https://openalex.org/W3172522282","https://openalex.org/W3178075329","https://openalex.org/W3187664142","https://openalex.org/W3201770677","https://openalex.org/W3216551675","https://openalex.org/W4200631575","https://openalex.org/W4286982960","https://openalex.org/W4312815172"],"related_works":["https://openalex.org/W2375873920","https://openalex.org/W2146114872","https://openalex.org/W2392060890","https://openalex.org/W2392760275","https://openalex.org/W2083530853","https://openalex.org/W2009831055","https://openalex.org/W2393172683","https://openalex.org/W3211744874","https://openalex.org/W1994626569","https://openalex.org/W4281476908"],"abstract_inverted_index":{"Referring":[0],"image":[1,11,54,116],"segmentation":[2],"aims":[3],"to":[4,13,28,34,89,92,105,133,154,161,168,183],"segment":[5],"the":[6,14,20,36,60,78,93,98,111,147,155,163,185,194],"target":[7],"region":[8],"from":[9,86],"an":[10],"according":[12,153],"query":[15,32],"language":[16,107,159],"description.":[17],"One":[18],"of":[19,67,70,114,137,158],"main":[21],"challenges":[22],"behind":[23],"this":[24,42],"fundamental":[25],"task":[26],"is":[27],"find":[29],"a":[30,46,135,177],"qualitative":[31],"representation":[33],"index":[35],"referred":[37],"object":[38],"or":[39],"stuff.":[40],"In":[41],"study,":[43],"we":[44,175],"introduced":[45],"query-based":[47],"framework":[48],"with":[49],"Transformer":[50],"architecture":[51],"for":[52],"referring":[53],"segmentation,":[55],"dubbed":[56],"SQFormer.":[57],"It":[58],"treats":[59],"sentence":[61,79],"and":[62,81,123,172,201],"word":[63,83,119],"embeddings":[64,80],"as":[65],"components":[66],"two":[68],"types":[69],"semantic":[71,101],"queries:":[72],"(i)":[73],"mask":[74,165],"queries":[75,84,102,120,130],"conditioned":[76],"on":[77,198],"(ii)":[82],"induced":[85],"text":[87],"inputs,":[88],"directly":[90],"attends":[91],"most":[94],"relevant":[95],"areas":[96],"in":[97,141],"image.":[99],"The":[100],"are":[103,131,150],"input-specific":[104],"diverse":[106],"expressions":[108],"while":[109],"maintaining":[110],"prior":[112],"knowledge":[113],"intrinsic":[115],"patterns.":[117],"Concretely,":[118],"enable":[121],"flexible":[122],"adaptive":[124],"interactions":[125],"between":[126],"vision-language":[127],"modalities.":[128],"Mask":[129,143],"obligated":[132],"generate":[134],"set":[136],"prototype":[138,148],"masks.":[139],"Then":[140],"Prototype":[142],"Balance":[144],"(PMB)":[145],"module,":[146],"masks":[149],"weighted":[151],"sum":[152],"holistic":[156],"understanding":[157],"expression":[160],"get":[162],"final":[164],"prediction.":[166],"Besides,":[167],"better":[169],"fuse":[170],"linguistic":[171],"visual":[173],"features,":[174],"propose":[176],"language-aware":[178],"feature":[179],"pyramid":[180],"network":[181],"(LA-FPN)":[182],"enhance":[184],"cross-modal":[186],"alignment.":[187],"Extensive":[188],"experiments":[189],"show":[190],"our":[191],"method":[192],"surpasses":[193],"previous":[195],"state-of-the-art":[196],"approaches":[197],"RefCOCO,":[199],"RefCOCO+,":[200],"G-Ref":[202],"datasets.":[203]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
