{"id":"https://openalex.org/W4403488561","doi":"https://doi.org/10.3233/faia240499","title":"OmniCLIP: Adapting CLIP for Video Recognition with Spatial-Temporal Omni-Scale Feature Learning","display_name":"OmniCLIP: Adapting CLIP for Video Recognition with Spatial-Temporal Omni-Scale Feature Learning","publication_year":2024,"publication_date":"2024-10-16","ids":{"openalex":"https://openalex.org/W4403488561","doi":"https://doi.org/10.3233/faia240499"},"language":"en","primary_location":{"id":"doi:10.3233/faia240499","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia240499","pdf_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA240499","source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA240499","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039853684","display_name":"Mushui Liu","orcid":"https://orcid.org/0000-0002-2909-7702"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Mushui Liu","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111321071","display_name":"Bozheng Li","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bozheng Li","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100722513","display_name":"Yunlong Yu","orcid":"https://orcid.org/0000-0002-4809-9738"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunlong Yu","raw_affiliation_strings":["Zhejiang University"],"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5039853684"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.4512,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.71748303,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9937999844551086,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.987500011920929,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6674451231956482},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5714319348335266},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5362666845321655},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4719725549221039},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4697670638561249},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.45424991846084595},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3439186215400696},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.16358748078346252},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.12895110249519348}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6674451231956482},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5714319348335266},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5362666845321655},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4719725549221039},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4697670638561249},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.45424991846084595},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3439186215400696},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.16358748078346252},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.12895110249519348},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/faia240499","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia240499","pdf_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA240499","source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"}],"best_oa_location":{"id":"doi:10.3233/faia240499","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia240499","pdf_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA240499","source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403488561.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3147584709","https://openalex.org/W2977677679","https://openalex.org/W1992327129","https://openalex.org/W2381986121","https://openalex.org/W2370918718","https://openalex.org/W2256933480","https://openalex.org/W2027854990","https://openalex.org/W2755342338","https://openalex.org/W2370081953","https://openalex.org/W2779427294"],"abstract_inverted_index":{"Recent":[0],"Vision-Language":[1],"Models":[2],"(VLMs)":[3],"e.g.":[4],"CLIP":[5,26,52],"have":[6,131],"made":[7],"great":[8],"progress":[9],"in":[10,22,30,135,165],"video":[11,40,54,137,140],"recognition.":[12,41],"Despite":[13],"the":[14,18,80,149,170],"improvement":[15],"brought":[16],"by":[17,56],"strong":[19],"visual":[20],"backbone":[21],"extracting":[23],"spatial":[24,107,120],"features,":[25],"still":[27],"falls":[28],"short":[29],"capturing":[31],"and":[32,65,113,124,142],"integrating":[33],"spatial-temporal":[34,67,83],"features":[35,61],"which":[36,69],"is":[37,77,181],"essential":[38],"for":[39,53],"In":[42],"this":[43],"paper,":[44],"we":[45,70,96],"propose":[46],"OmniCLIP,":[47],"a":[48,98,158,166],"framework":[49],"that":[50,85],"adapts":[51],"recognition":[55,144],"focusing":[57],"on":[58,163],"learning":[59],"comprehensive":[60],"encompassing":[62],"spatial,":[63],"temporal,":[64],"dynamic":[66,105],"scales,":[68],"refer":[71],"to":[72,103,117],"as":[73],"omni-scale":[74],"features.":[75,108],"This":[76],"achieved":[78],"through":[79],"design":[81],"of":[82,151,161],"blocks":[84],"include":[86],"parallel":[87],"temporal":[88,93],"adapters":[89],"(PTA),":[90],"enabling":[91],"efficient":[92],"modeling.":[94],"Additionally,":[95],"introduce":[97],"self-prompt":[99],"generator":[100],"(SPG)":[101],"module":[102],"capture":[104],"object":[106,126],"The":[109,146,179],"synergy":[110],"between":[111],"PTA":[112],"SPG":[114],"allows":[115],"OmniCLIP":[116,156],"discern":[118],"varying":[119],"information":[121],"across":[122],"frames":[123],"assess":[125],"scales":[127],"over":[128],"time.":[129],"We":[130],"conducted":[132],"extensive":[133],"experiments":[134],"supervised":[136],"recognition,":[138,141],"few-shot":[139],"zero-shot":[143],"tasks.":[145],"results":[147],"demonstrate":[148],"effectiveness":[150],"our":[152],"method,":[153],"especially":[154],"with":[155,175],"achieving":[157],"top-1":[159],"accuracy":[160],"74.30%":[162],"HMDB51":[164],"16-shot":[167],"setting,":[168],"surpassing":[169],"recent":[171],"MotionPrompt":[172],"approach":[173],"even":[174],"full":[176],"training":[177],"data.":[178],"code":[180],"available":[182],"at":[183],"https://github.com/XiaoBuL/OmniCLIP.":[184]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
