{"id":"https://openalex.org/W3092960051","doi":"https://doi.org/10.1145/3394171.3413990","title":"Cap2Seg: Inferring Semantic and Spatial Context from Captions for Zero-Shot Image Segmentation","display_name":"Cap2Seg: Inferring Semantic and Spatial Context from Captions for Zero-Shot Image Segmentation","publication_year":2020,"publication_date":"2020-10-12","ids":{"openalex":"https://openalex.org/W3092960051","doi":"https://doi.org/10.1145/3394171.3413990","mag":"3092960051"},"language":"en","primary_location":{"id":"doi:10.1145/3394171.3413990","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3394171.3413990","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 28th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5024163245","display_name":"Guiyu Tian","orcid":"https://orcid.org/0009-0005-6262-0146"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Guiyu Tian","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100328272","display_name":"Shuai Wang","orcid":"https://orcid.org/0000-0002-1595-3619"},"institutions":[{"id":"https://openalex.org/I4210100976","display_name":"BOE Technology Group (China)","ror":"https://ror.org/01cwwvj38","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210100976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuai Wang","raw_affiliation_strings":["BOE Technology Group Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"BOE Technology Group Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I4210100976"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052017667","display_name":"Jie Feng","orcid":"https://orcid.org/0000-0001-9496-0034"},"institutions":[{"id":"https://openalex.org/I4210100976","display_name":"BOE Technology Group (China)","ror":"https://ror.org/01cwwvj38","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210100976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Feng","raw_affiliation_strings":["BOE Technology Group Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"BOE Technology Group Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I4210100976"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100742870","display_name":"Li Zhou","orcid":"https://orcid.org/0000-0001-5743-8490"},"institutions":[{"id":"https://openalex.org/I4210100976","display_name":"BOE Technology Group (China)","ror":"https://ror.org/01cwwvj38","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210100976"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Zhou","raw_affiliation_strings":["BOE Technology Group Co., Ltd., Beijing, China"],"affiliations":[{"raw_affiliation_string":"BOE Technology Group Co., Ltd., Beijing, China","institution_ids":["https://openalex.org/I4210100976"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028877572","display_name":"Yadong Mu","orcid":"https://orcid.org/0000-0001-7815-3750"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yadong Mu","raw_affiliation_strings":["Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5024163245"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":1.0747,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.80026215,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"4125","last_page":"4134"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7174683213233948},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.713699996471405},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5967953205108643},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5175827741622925},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.5166283249855042},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4796714782714844},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4747626781463623},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.44727662205696106},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.4366084337234497},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.43577277660369873},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4332507252693176},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.427909791469574},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32272666692733765},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.07032492756843567}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7174683213233948},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.713699996471405},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5967953205108643},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5175827741622925},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.5166283249855042},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4796714782714844},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4747626781463623},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.44727662205696106},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.4366084337234497},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.43577277660369873},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4332507252693176},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.427909791469574},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32272666692733765},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.07032492756843567},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3394171.3413990","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3394171.3413990","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 28th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":50,"referenced_works":["https://openalex.org/W137514618","https://openalex.org/W611457968","https://openalex.org/W652269744","https://openalex.org/W1495267108","https://openalex.org/W1529410181","https://openalex.org/W1903029394","https://openalex.org/W1927251054","https://openalex.org/W1945608308","https://openalex.org/W2037227137","https://openalex.org/W2044913453","https://openalex.org/W2128532956","https://openalex.org/W2171061940","https://openalex.org/W2194775991","https://openalex.org/W2221898772","https://openalex.org/W2289084343","https://openalex.org/W2295107390","https://openalex.org/W2296073425","https://openalex.org/W2306289963","https://openalex.org/W2334493732","https://openalex.org/W2337429362","https://openalex.org/W2400717490","https://openalex.org/W2412782625","https://openalex.org/W2552383788","https://openalex.org/W2561196672","https://openalex.org/W2565639579","https://openalex.org/W2601051138","https://openalex.org/W2605214291","https://openalex.org/W2611632661","https://openalex.org/W2630837129","https://openalex.org/W2739450375","https://openalex.org/W2748618181","https://openalex.org/W2783837693","https://openalex.org/W2798376494","https://openalex.org/W2889587173","https://openalex.org/W2910453440","https://openalex.org/W2924485953","https://openalex.org/W2944967134","https://openalex.org/W2947628551","https://openalex.org/W2950133940","https://openalex.org/W2950276680","https://openalex.org/W2963311325","https://openalex.org/W2963499153","https://openalex.org/W2963545832","https://openalex.org/W2964086552","https://openalex.org/W2964121744","https://openalex.org/W2964307109","https://openalex.org/W2981613027","https://openalex.org/W2994827984","https://openalex.org/W3012017706","https://openalex.org/W3100093508"],"related_works":["https://openalex.org/W2081900870","https://openalex.org/W2085033728","https://openalex.org/W4285411112","https://openalex.org/W2171299904","https://openalex.org/W2345479200","https://openalex.org/W1647606319","https://openalex.org/W2922442631","https://openalex.org/W2183306018","https://openalex.org/W4390494008","https://openalex.org/W2119567889"],"abstract_inverted_index":{"Zero-shot":[0],"image":[1,51,56,68,75,88,131,148,171],"segmentation":[2,21,38,52,69],"refers":[3],"to":[4,34],"the":[5,66,80,108,157,160,177],"task":[6],"of":[7,31,49,82,116,127,159,166],"segmenting":[8],"pixels":[9],"from":[10,130],"specific":[11],"unseen":[12,113,128,168],"semantic":[13,26,63],"class.":[14],"Previous":[15],"methods":[16],"mainly":[17],"rely":[18],"on":[19,146],"historic":[20],"tasks,":[22],"such":[23],"as":[24],"using":[25],"embedding":[27,30],"or":[28,135],"word":[29],"class":[32,85,129,178],"names":[33,179],"infer":[35,125],"a":[36,46,83,96,164],"new":[37,84],"model.":[39],"In":[40,106],"this":[41],"work":[42],"we":[43],"describe":[44],"Cap2Seg,":[45,162],"novel":[47],"solution":[48],"zero-shot":[50,67],"that":[53,101],"harnesses":[54],"accompanying":[55],"captions":[57,76,172],"for":[58,65,111,183],"intelligently":[59],"inferring":[60,112],"spatial":[61,92],"and":[62,89,141,151,180],"context":[64],"task.":[70],"As":[71],"our":[72],"main":[73],"insight,":[74],"often":[77],"implicitly":[78],"entail":[79],"occurrence":[81],"in":[86],"an":[87],"its":[90],"most-confident":[91],"distribution.":[93],"We":[94],"define":[95],"contextual":[97],"entailment":[98],"question":[99],"(CEQ)":[100],"tailors":[102],"BERT-like":[103],"text":[104],"models.":[105],"specific,":[107],"proposed":[109,161],"networks":[110],"classes":[114,169],"consists":[115],"three":[117],"branches":[118],"(global":[119],"/":[120,122],"local":[121],"semi-global),":[123],"which":[124],"labels":[126],"level,":[132],"image-stripe":[133],"level":[134,137],"pixel":[136],"respectively.":[138],"Comprehensive":[139],"experiments":[140],"ablation":[142],"studies":[143],"are":[144],"conducted":[145],"two":[147],"benchmarks,":[149],"COCO-stuff":[150],"Pascal":[152],"VOC.":[153],"All":[154],"clearly":[155],"demonstrate":[156],"effectiveness":[158],"including":[163],"set":[165],"hardest":[167],"(i.e.,":[170],"do":[173],"not":[174],"literally":[175],"contain":[176],"direct":[181],"matching":[182],"inference":[184],"fails).":[185]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
