{"id":"https://openalex.org/W4391128821","doi":"https://doi.org/10.1109/tpami.2024.3357503","title":"Building an Open-Vocabulary Video CLIP Model With Better Architectures, Optimization and Data","display_name":"Building an Open-Vocabulary Video CLIP Model With Better Architectures, Optimization and Data","publication_year":2024,"publication_date":"2024-01-23","ids":{"openalex":"https://openalex.org/W4391128821","doi":"https://doi.org/10.1109/tpami.2024.3357503","pmid":"https://pubmed.ncbi.nlm.nih.gov/38261478"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2024.3357503","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3357503","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026167547","display_name":"Zuxuan Wu","orcid":"https://orcid.org/0000-0002-8689-5807"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zuxuan Wu","raw_affiliation_strings":["School of Computer Science, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-8689-5807","affiliations":[{"raw_affiliation_string":"School of Computer Science, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075142476","display_name":"Zejia Weng","orcid":"https://orcid.org/0000-0001-9706-6484"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zejia Weng","raw_affiliation_strings":["School of Computer Science, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-9706-6484","affiliations":[{"raw_affiliation_string":"School of Computer Science, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101798999","display_name":"Wujian Peng","orcid":"https://orcid.org/0009-0001-6428-276X"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wujian Peng","raw_affiliation_strings":["School of Computer Science, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0001-6428-276X","affiliations":[{"raw_affiliation_string":"School of Computer Science, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091064356","display_name":"Xitong Yang","orcid":"https://orcid.org/0000-0003-4372-241X"},"institutions":[{"id":"https://openalex.org/I3197470489","display_name":"Alpha Omega Alpha Medical Honor Society","ror":"https://ror.org/057q9nn35","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I3197470489"]},{"id":"https://openalex.org/I4210099336","display_name":"Menlo School","ror":"https://ror.org/01240pn49","country_code":"US","type":"education","lineage":["https://openalex.org/I4210099336"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xitong Yang","raw_affiliation_strings":["Meta AI, Menlo Park, CA, USA"],"raw_orcid":"https://orcid.org/0000-0003-4372-241X","affiliations":[{"raw_affiliation_string":"Meta AI, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I3197470489","https://openalex.org/I4210099336"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100413666","display_name":"Ang Li","orcid":"https://orcid.org/0000-0003-4795-8374"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ang Li","raw_affiliation_strings":["Simular AI, San Mateo, CA, USA"],"raw_orcid":"https://orcid.org/0000-0003-4795-8374","affiliations":[{"raw_affiliation_string":"Simular AI, San Mateo, CA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111454036","display_name":"Larry S. Davis","orcid":null},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Larry S. Davis","raw_affiliation_strings":["Department of Computer Science, University of Maryland, College Park, MD, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Maryland, College Park, MD, USA","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047962986","display_name":"Yu\u2013Gang Jiang","orcid":"https://orcid.org/0000-0002-1907-8567"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu-Gang Jiang","raw_affiliation_strings":["School of Computer Science, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-1907-8567","affiliations":[{"raw_affiliation_string":"School of Computer Science, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.9371,"has_fulltext":false,"cited_by_count":18,"citation_normalized_percentile":{"value":0.94615366,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"46","issue":"7","first_page":"4747","last_page":"4762"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8427855968475342},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.689346432685852},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.656825065612793},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5238245129585266},{"id":"https://openalex.org/keywords/interpolation","display_name":"Interpolation (computer graphics)","score":0.4498063325881958},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4468838572502136},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.43240320682525635},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.389930784702301},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.21855810284614563}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8427855968475342},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.689346432685852},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.656825065612793},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5238245129585266},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.4498063325881958},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4468838572502136},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43240320682525635},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.389930784702301},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.21855810284614563},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2024.3357503","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3357503","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:38261478","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/38261478","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5400000214576721}],"awards":[{"id":"https://openalex.org/G472569701","display_name":null,"funder_award_id":"62032006","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6994314671","display_name":null,"funder_award_id":"62102092","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":98,"referenced_works":["https://openalex.org/W24089286","https://openalex.org/W1861492603","https://openalex.org/W2064851185","https://openalex.org/W2100771357","https://openalex.org/W2108598243","https://openalex.org/W2126579184","https://openalex.org/W2277195237","https://openalex.org/W2425121537","https://openalex.org/W2560129302","https://openalex.org/W2560647685","https://openalex.org/W2619947201","https://openalex.org/W2886641317","https://openalex.org/W2887051120","https://openalex.org/W2904378456","https://openalex.org/W2963173418","https://openalex.org/W2963689837","https://openalex.org/W2963783448","https://openalex.org/W2965373098","https://openalex.org/W2984008963","https://openalex.org/W3016459781","https://openalex.org/W3021778166","https://openalex.org/W3035254087","https://openalex.org/W3091825536","https://openalex.org/W3092475441","https://openalex.org/W3126721948","https://openalex.org/W3175939205","https://openalex.org/W3176125528","https://openalex.org/W3176641147","https://openalex.org/W3198675127","https://openalex.org/W3200114289","https://openalex.org/W3203711169","https://openalex.org/W3204588463","https://openalex.org/W3209532394","https://openalex.org/W3216551675","https://openalex.org/W4214612132","https://openalex.org/W4214746887","https://openalex.org/W4214951466","https://openalex.org/W4226173246","https://openalex.org/W4283819124","https://openalex.org/W4285606530","https://openalex.org/W4303648940","https://openalex.org/W4304731170","https://openalex.org/W4309396261","https://openalex.org/W4310877696","https://openalex.org/W4310921506","https://openalex.org/W4312254032","https://openalex.org/W4312282373","https://openalex.org/W4312614039","https://openalex.org/W4312912313","https://openalex.org/W4313034430","https://openalex.org/W4319988532","https://openalex.org/W4324321291","https://openalex.org/W4366735744","https://openalex.org/W4379255958","https://openalex.org/W4382467086","https://openalex.org/W4384918448","https://openalex.org/W4385572645","https://openalex.org/W4386071476","https://openalex.org/W4386271714","https://openalex.org/W4387725629","https://openalex.org/W6600983433","https://openalex.org/W6676497082","https://openalex.org/W6738602802","https://openalex.org/W6749107692","https://openalex.org/W6754337694","https://openalex.org/W6766578407","https://openalex.org/W6769244778","https://openalex.org/W6784318702","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6797206543","https://openalex.org/W6801567822","https://openalex.org/W6802987763","https://openalex.org/W6803224767","https://openalex.org/W6811013733","https://openalex.org/W6811112093","https://openalex.org/W6838329711","https://openalex.org/W6841672634","https://openalex.org/W6843018836","https://openalex.org/W6845730195","https://openalex.org/W6846002521","https://openalex.org/W6846054188","https://openalex.org/W6846310912","https://openalex.org/W6846313647","https://openalex.org/W6846726380","https://openalex.org/W6846867676","https://openalex.org/W6849177959","https://openalex.org/W6849783008","https://openalex.org/W6849786500","https://openalex.org/W6849829975","https://openalex.org/W6849896277","https://openalex.org/W6852172664","https://openalex.org/W6852874933","https://openalex.org/W6853357209","https://openalex.org/W6854866820","https://openalex.org/W6856141230","https://openalex.org/W6857197186","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W2349784553","https://openalex.org/W3022596247","https://openalex.org/W2601444686","https://openalex.org/W4307058054","https://openalex.org/W4292238148","https://openalex.org/W4323660495","https://openalex.org/W2385319785","https://openalex.org/W2900827440","https://openalex.org/W3167549738","https://openalex.org/W2381983017"],"abstract_inverted_index":{"Despite":[0],"significant":[1,170],"results":[2,160],"achieved":[3],"by":[4,169,195],"Contrastive":[5],"Language-Image":[6],"Pretraining":[7],"(CLIP)":[8],"in":[9,60],"zero-shot":[10,22,40,156,175],"image":[11],"recognition,":[12],"limited":[13],"effort":[14],"has":[15],"been":[16],"made":[17],"exploring":[18],"its":[19],"potential":[20],"for":[21,70],"video":[23,41,66,120,129,139],"recognition.":[24],"This":[25],"paper":[26],"presents":[27],"Open-VCLIP++,":[28],"a":[29,38,64,96,132,153],"simple":[30],"yet":[31],"effective":[32],"framework":[33],"that":[34,75,98,162],"adapts":[35],"CLIP":[36,55,136],"to":[37,56,80,117,137,227],"strong":[39],"classifier,":[42],"capable":[43],"of":[44,102,135,155,178],"identifying":[45],"novel":[46],"actions":[47],"and":[48,108,181,186,198,216],"events":[49],"during":[50,105],"testing.":[51,109],"Open-VCLIP++":[52,77],"minimally":[53],"modifies":[54],"capture":[57],"spatial-temporal":[58],"relationships":[59],"videos,":[61],"thereby":[62],"creating":[63],"specialized":[65],"classifier":[67],"while":[68,220],"striving":[69],"generalization.":[71],"We":[72,200],"formally":[73],"demonstrate":[74,161],"training":[76,107],"is":[78,143],"tantamount":[79],"continual":[81],"learning":[82],"with":[83,128],"zero":[84],"historical":[85],"data.":[86],"To":[87],"address":[88],"this":[89],"problem,":[90],"we":[91,111,173],"introduce":[92],"Interpolated":[93],"Weight":[94],"Optimization,":[95],"technique":[97],"leverages":[99],"the":[100,138,191,206],"advantages":[101],"weight":[103],"interpolation":[104],"both":[106],"Furthermore,":[110],"build":[112],"upon":[113],"large":[114],"language":[115],"models":[116],"produce":[118],"fine-grained":[119],"descriptions.":[121],"These":[122],"detailed":[123],"descriptions":[124],"are":[125],"further":[126],"aligned":[127],"features,":[130],"facilitating":[131],"better":[133],"transfer":[134],"domain.":[140],"Our":[141],"approach":[142,204],"evaluated":[144],"on":[145,183,205],"three":[146],"widely":[147],"used":[148],"action":[149],"recognition":[150],"datasets,":[151],"following":[152],"variety":[154],"evaluation":[157],"protocols.":[158],"The":[159],"our":[163,203],"method":[164],"surpasses":[165],"existing":[166],"state-of-the-art":[167],"techniques":[168],"margins.":[171],"Specifically,":[172],"achieve":[174],"accuracy":[176],"scores":[177],"88.1%,":[179],"58.7%,":[180],"81.2%":[182],"UCF,":[184],"HMDB,":[185],"Kinetics-600":[187],"datasets":[188],"respectively,":[189],"outpacing":[190],"best-performing":[192],"alternative":[193],"methods":[194],"8.5%,":[196],"8.2%,":[197],"12.3%.":[199],"also":[201],"evaluate":[202],"MSR-VTT":[207],"video-text":[208],"retrieval":[209,218],"dataset,":[210],"where":[211],"it":[212],"delivers":[213],"competitive":[214],"video-to-text":[215],"text-to-video":[217],"performance,":[219],"utilizing":[221],"substantially":[222],"less":[223],"fine-tuning":[224],"data":[225],"compared":[226],"other":[228],"methods.":[229]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":5}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
