{"id":"https://openalex.org/W4414458634","doi":"https://doi.org/10.1109/tcsvt.2025.3613997","title":"Modeling Cross-Modal Semantic Transformations From Coarse to Fine in CLIP","display_name":"Modeling Cross-Modal Semantic Transformations From Coarse to Fine in CLIP","publication_year":2025,"publication_date":"2025-09-24","ids":{"openalex":"https://openalex.org/W4414458634","doi":"https://doi.org/10.1109/tcsvt.2025.3613997"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2025.3613997","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3613997","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Ziqi Peng","orcid":"https://orcid.org/0009-0004-0123-7023"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ziqi Peng","raw_affiliation_strings":["Department of Automation, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Department of Automation, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119713072","display_name":"Zhenyu Qi","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenyu Qi","raw_affiliation_strings":["Institute of Advanced Technology, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Institute of Advanced Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004299336","display_name":"Yang Cao","orcid":"https://orcid.org/0000-0002-2891-4379"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Cao","raw_affiliation_strings":["Department of Automation, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Department of Automation, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025987638","display_name":"Yu Kang","orcid":"https://orcid.org/0000-0002-8706-3252"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Kang","raw_affiliation_strings":["Department of Automation, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Department of Automation, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089356327","display_name":"Wenjun Lv","orcid":"https://orcid.org/0000-0002-7583-0944"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjun Lv","raw_affiliation_strings":["Department of Automation, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Department of Automation, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.13366448,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"36","issue":"2","first_page":"2164","last_page":"2176"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9768000245094299,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9768000245094299,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9628000259399414,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6389999985694885},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.557200014591217},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5260999798774719},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5},{"id":"https://openalex.org/keywords/transformation","display_name":"Transformation (genetics)","score":0.4860999882221222},{"id":"https://openalex.org/keywords/interpolation","display_name":"Interpolation (computer graphics)","score":0.4830999970436096},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.48030000925064087},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4551999866962433},{"id":"https://openalex.org/keywords/aerial-image","display_name":"Aerial image","score":0.44920000433921814},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.42500001192092896}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7943999767303467},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6389999985694885},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6086999773979187},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.557200014591217},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5260999798774719},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.4860999882221222},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.4830999970436096},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.48030000925064087},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4551999866962433},{"id":"https://openalex.org/C2776429412","wikidata":"https://www.wikidata.org/wiki/Q4688011","display_name":"Aerial image","level":3,"score":0.44920000433921814},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.35519999265670776},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3352000117301941},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.3273000121116638},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.3127000033855438},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C191172861","wikidata":"https://www.wikidata.org/wiki/Q7899321","display_name":"Upstream (networking)","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.2752000093460083},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2676999866962433},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.26579999923706055},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.2637999951839447},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2565000057220459},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2025.3613997","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3613997","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2642064385","display_name":null,"funder_award_id":"202407a12020009","funder_id":"https://openalex.org/F4320313610","funder_display_name":"Shanghai Science and Technology Development Foundation"},{"id":"https://openalex.org/G7723707124","display_name":null,"funder_award_id":"62273319","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320313610","display_name":"Shanghai Science and Technology Development Foundation","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W12634471","https://openalex.org/W105108737","https://openalex.org/W1977295328","https://openalex.org/W2047643928","https://openalex.org/W2108598243","https://openalex.org/W2138011018","https://openalex.org/W2155904486","https://openalex.org/W2194775991","https://openalex.org/W2533598788","https://openalex.org/W2562153041","https://openalex.org/W2962849408","https://openalex.org/W2963018920","https://openalex.org/W2963283377","https://openalex.org/W2964194231","https://openalex.org/W3012255272","https://openalex.org/W3013267890","https://openalex.org/W3034942609","https://openalex.org/W3037492894","https://openalex.org/W3177096435","https://openalex.org/W3182683290","https://openalex.org/W3198377975","https://openalex.org/W4229453513","https://openalex.org/W4285170134","https://openalex.org/W4289256549","https://openalex.org/W4297697565","https://openalex.org/W4311415873","https://openalex.org/W4312310776","https://openalex.org/W4312559104","https://openalex.org/W4312563428","https://openalex.org/W4312933868","https://openalex.org/W4313175608","https://openalex.org/W4376464610","https://openalex.org/W4379382677","https://openalex.org/W4385245566","https://openalex.org/W4386071547","https://openalex.org/W4386699374","https://openalex.org/W4386790226","https://openalex.org/W4390872357","https://openalex.org/W4393156091","https://openalex.org/W4400904454","https://openalex.org/W4403649753","https://openalex.org/W4406414399"],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"like":[3],"CLIP":[4,132],"have":[5],"advanced":[6],"image":[7,27,54],"representation":[8],"through":[9],"open-vocabulary":[10],"semantic":[11,78,152],"alignment.":[12],"Yet,":[13],"existing":[14],"few-shot":[15,53,156],"transfer":[16,34,97],"learning":[17],"methods":[18],"largely":[19],"overlook":[20],"the":[21,77,96,130,143],"intrinsic":[22],"interdependencies":[23],"between":[24,81],"text":[25],"and":[26,72,83,102,117,126,135],"embeddings,":[28],"limiting":[29],"their":[30],"ability":[31],"to":[32,60,91,106,124],"fully":[33],"CLIP\u2019s":[35],"pretrained":[36,147],"capabilities.":[37],"To":[38],"address":[39],"this":[40],"gap,":[41],"we":[42],"propose":[43],"Hyperspherical":[44],"Interpolation":[45],"Variational":[46],"Encoding":[47],"(HIVE),":[48],"a":[49,159],"novel":[50],"method":[51],"for":[52,69,154,162],"classification.":[55],"Our":[56,140],"core":[57],"idea":[58],"is":[59],"shift":[61],"away":[62],"from":[63,89],"directly":[64],"training":[65],"feature":[66,100],"extraction":[67,101],"capabilities":[68,105],"downstream":[70,84,107],"tasks,":[71,138],"instead":[73],"focus":[74],"on":[75,111],"exploring":[76],"transformation":[79],"relationships":[80],"upstream":[82],"tasks.":[85,108],"By":[86],"modeling":[87],"semantics":[88],"coarse":[90],"fine":[92],"granularity,":[93],"HIVE":[94],"enables":[95],"of":[98,145],"original":[99,131],"modality":[103],"alignment":[104],"Extensive":[109],"experiments":[110],"eight":[112],"established":[113],"benchmarks,":[114],"including":[115],"CUB":[116],"EuroSAT,":[118],"validate":[119],"HIVE\u2019s":[120],"efficacy,":[121],"achieving":[122],"up":[123],"46.2%":[125],"80.0%":[127],"improvements":[128],"over":[129],"in":[133],"1-shot":[134],"16-shot":[136],"classification":[137],"respectively.":[139],"work":[141],"underscores":[142],"importance":[144],"preserving":[146],"geometric":[148],"constraints":[149],"while":[150],"exploiting":[151],"hierarchies":[153],"effective":[155],"adaptation,":[157],"providing":[158],"principled":[160],"approach":[161],"vision-language":[163],"model":[164],"customization.":[165]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
