{"id":"https://openalex.org/W4403182220","doi":"https://doi.org/10.1109/tcsvt.2024.3474698","title":"CLIP-VIS: Adapting CLIP for Open-Vocabulary Video Instance Segmentation","display_name":"CLIP-VIS: Adapting CLIP for Open-Vocabulary Video Instance Segmentation","publication_year":2024,"publication_date":"2024-10-07","ids":{"openalex":"https://openalex.org/W4403182220","doi":"https://doi.org/10.1109/tcsvt.2024.3474698"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2024.3474698","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2024.3474698","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100301546","display_name":"Wenqi Zhu","orcid":"https://orcid.org/0009-0008-0357-441X"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenqi Zhu","raw_affiliation_strings":["School of Electrical and Information Engineering, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0009-0008-0357-441X","affiliations":[{"raw_affiliation_string":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037129512","display_name":"Jiale Cao","orcid":"https://orcid.org/0000-0002-5160-6841"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiale Cao","raw_affiliation_strings":["School of Electrical and Information Engineering, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-5160-6841","affiliations":[{"raw_affiliation_string":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089409762","display_name":"Jin Xie","orcid":"https://orcid.org/0000-0001-6978-8834"},"institutions":[{"id":"https://openalex.org/I158842170","display_name":"Chongqing University","ror":"https://ror.org/023rhb549","country_code":"CN","type":"education","lineage":["https://openalex.org/I158842170"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jin Xie","raw_affiliation_strings":["School of Big Data and Software Engineering, Chongqing University, Chongqing, China"],"raw_orcid":"https://orcid.org/0000-0001-6978-8834","affiliations":[{"raw_affiliation_string":"School of Big Data and Software Engineering, Chongqing University, Chongqing, China","institution_ids":["https://openalex.org/I158842170"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065854845","display_name":"Shuangming Yang","orcid":"https://orcid.org/0000-0002-8044-0860"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuangming Yang","raw_affiliation_strings":["School of Electrical and Information Engineering, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-8044-0860","affiliations":[{"raw_affiliation_string":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086887025","display_name":"Yanwei Pang","orcid":"https://orcid.org/0000-0001-6670-3727"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanwei Pang","raw_affiliation_strings":["School of Electrical and Information Engineering, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0001-6670-3727","affiliations":[{"raw_affiliation_string":"School of Electrical and Information Engineering, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.1383,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.89308971,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"35","issue":"2","first_page":"1098","last_page":"1110"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.972599983215332,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7221184372901917},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5864733457565308},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5704643130302429},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5591999888420105},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.5441095232963562},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.4627198576927185}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7221184372901917},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5864733457565308},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5704643130302429},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5591999888420105},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.5441095232963562},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.4627198576927185},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2024.3474698","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2024.3474698","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6299999952316284}],"awards":[{"id":"https://openalex.org/G2199262417","display_name":null,"funder_award_id":"62206031","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5074938571","display_name":null,"funder_award_id":"62271346","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":70,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2194775991","https://openalex.org/W2222512263","https://openalex.org/W2252355370","https://openalex.org/W2889986507","https://openalex.org/W2948672349","https://openalex.org/W2962803115","https://openalex.org/W2962914239","https://openalex.org/W2963150697","https://openalex.org/W2970231061","https://openalex.org/W2982723417","https://openalex.org/W2993182889","https://openalex.org/W3090449556","https://openalex.org/W3097885906","https://openalex.org/W3106546328","https://openalex.org/W3110109236","https://openalex.org/W3134837903","https://openalex.org/W3171516518","https://openalex.org/W3173980723","https://openalex.org/W3202509201","https://openalex.org/W3212555189","https://openalex.org/W4312275238","https://openalex.org/W4312294051","https://openalex.org/W4312396403","https://openalex.org/W4312424618","https://openalex.org/W4312443924","https://openalex.org/W4312458986","https://openalex.org/W4312547276","https://openalex.org/W4312573566","https://openalex.org/W4312747482","https://openalex.org/W4312815172","https://openalex.org/W4312935996","https://openalex.org/W4312956471","https://openalex.org/W4313007081","https://openalex.org/W4313192411","https://openalex.org/W4319300361","https://openalex.org/W4321194930","https://openalex.org/W4381785648","https://openalex.org/W4385245566","https://openalex.org/W4386066071","https://openalex.org/W4386071948","https://openalex.org/W4386075561","https://openalex.org/W4386075819","https://openalex.org/W4386075882","https://openalex.org/W4386076040","https://openalex.org/W4386076397","https://openalex.org/W4386076694","https://openalex.org/W4386083103","https://openalex.org/W4387350728","https://openalex.org/W4390873537","https://openalex.org/W4390873791","https://openalex.org/W4390874207","https://openalex.org/W4390874578","https://openalex.org/W4391428214","https://openalex.org/W4404893957","https://openalex.org/W6757817989","https://openalex.org/W6766904570","https://openalex.org/W6791353385","https://openalex.org/W6803953248","https://openalex.org/W6809716307","https://openalex.org/W6811433417","https://openalex.org/W6839371072","https://openalex.org/W6842162983","https://openalex.org/W6843861238","https://openalex.org/W6846007759","https://openalex.org/W6848927509","https://openalex.org/W6853386698","https://openalex.org/W6855205047","https://openalex.org/W6858987627","https://openalex.org/W6860065554"],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2772917594","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398","https://openalex.org/W2775347418"],"abstract_inverted_index":{"Open-vocabulary":[0],"video":[1,53,173],"instance":[2,54,163,174],"segmentation":[3,175],"strives":[4],"to":[5,11,48,98,133],"segment":[6],"and":[7,61,72,89,102,106,143,151,165,199,204,218,227],"track":[8],"instances":[9],"belonging":[10],"an":[12],"open":[13],"set":[14,78,209],"of":[15,79,162,181,202,210],"categories":[16,164],"in":[17,33],"a":[18,42,77,86,90],"videos.":[19],"The":[20,167],"vision-language":[21],"model":[22],"Contrastive":[23],"Language-Image":[24],"Pre-training":[25],"(CLIP)":[26],"has":[27],"shown":[28],"robust":[29],"zero-shot":[30],"classification":[31,128,147],"ability":[32],"image-level":[34],"open-vocabulary":[35,52,74,127],"tasks.":[36],"In":[37],"this":[38],"paper,":[39],"we":[40],"propose":[41],"simple":[43],"encoder-decoder":[44],"network,":[45],"called":[46],"CLIP-VIS,":[47],"adapt":[49],"CLIP":[50,60,94,139],"for":[51,186],"segmentation.":[55],"Our":[56,155],"CLIP-VIS":[57,156,195],"adopts":[58],"frozen":[59],"introduces":[62,85],"three":[63],"modules,":[64],"including":[65],"class-agnostic":[66,82],"mask":[67,83,107,131,152],"generation,":[68],"temporal":[69,111],"topK-enhanced":[70,112],"matching,":[71],"weighted":[73,126,146],"classification.":[75],"Given":[76],"initial":[80],"queries,":[81],"generation":[84],"pixel":[87],"decoder":[88,92],"transformer":[91],"on":[93,171,206],"pre-trained":[95,140],"image":[96,141],"encoder":[97],"predict":[99],"query":[100,115,135],"masks":[101],"corresponding":[103],"object":[104,149],"scores":[105,150,201],"IoU":[108,153],"scores.":[109,154],"Then,":[110],"matching":[113,116],"performs":[114,145],"across":[117],"frames":[118],"using":[119,148,190],"the":[120,160,179,197,207,224],"K":[121],"mostly":[122],"matched":[123],"frames.":[124],"Finally,":[125],"first":[129],"employs":[130],"pooling":[132],"generate":[134],"visual":[136],"features":[137],"from":[138],"encoder,":[142],"second":[144],"does":[157],"not":[158],"require":[159],"annotations":[161],"identities.":[166],"experiments":[168],"are":[169],"performed":[170],"various":[172],"datasets,":[176],"which":[177,213],"demonstrate":[178],"effectiveness":[180],"our":[182,194],"proposed":[183],"method,":[184],"especially":[185],"novel":[187],"categories.":[188],"When":[189],"ConvNeXt-B":[191],"as":[192],"backbone,":[193],"achieves":[196],"AP":[198],"APn":[200],"32.2%":[203],"40.2%":[205],"validation":[208],"LV-VIS":[211],"dataset,":[212],"outperforms":[214],"OV2Seg":[215],"by":[216],"11.1%":[217],"23.9%":[219],"respectively.":[220],"We":[221],"will":[222],"release":[223],"source":[225],"code":[226],"models":[228],"at":[229],"<uri":[230],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[231],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">https://github.com/zwq456/CLIP-VIS.git</uri>.":[232]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
