{"id":"https://openalex.org/W7160651790","doi":"https://doi.org/10.48550/arxiv.2605.06592","title":"DINORANKCLIP: DINOv3 Distillation and Injection for Vision-Language Pretraining with High-Order Ranking Consistency","display_name":"DINORANKCLIP: DINOv3 Distillation and Injection for Vision-Language Pretraining with High-Order Ranking Consistency","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160651790","doi":"https://doi.org/10.48550/arxiv.2605.06592"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.06592","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06592","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.06592","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135669699","display_name":"Shuyang Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Shuyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125776121","display_name":"Nan Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Nan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135650981","display_name":"Yiming Zhang","orcid":"https://orcid.org/0009-0004-1494-3437"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135666914","display_name":"Zenghui Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Zenghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135691898","display_name":"Zhenyu Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zhenyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.007000000216066837,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.005499999970197678,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.7376999855041504},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.6358000040054321},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5703999996185303},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.5673999786376953},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.54830002784729},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4713999927043915},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.4708999991416931},{"id":"https://openalex.org/keywords/concatenation","display_name":"Concatenation (mathematics)","score":0.4401000142097473}],"concepts":[{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.7376999855041504},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.6358000040054321},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5703999996185303},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.5673999786376953},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.54830002784729},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5385000109672546},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5198000073432922},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4713999927043915},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.4708999991416931},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.46230000257492065},{"id":"https://openalex.org/C87619178","wikidata":"https://www.wikidata.org/wiki/Q126002","display_name":"Concatenation (mathematics)","level":2,"score":0.4401000142097473},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.39800000190734863},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.3508000075817108},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.335999995470047},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30329999327659607},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C78023250","wikidata":"https://www.wikidata.org/wiki/Q657596","display_name":"Unary operation","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C159423971","wikidata":"https://www.wikidata.org/wiki/Q177251","display_name":"Associative property","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2851000130176544},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C81386100","wikidata":"https://www.wikidata.org/wiki/Q7100792","display_name":"Ordinal optimization","level":3,"score":0.2797999978065491},{"id":"https://openalex.org/C191399111","wikidata":"https://www.wikidata.org/wiki/Q64861","display_name":"Transitive relation","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.2599000036716461},{"id":"https://openalex.org/C64543145","wikidata":"https://www.wikidata.org/wiki/Q162942","display_name":"Intersection (aeronautics)","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2581999897956848},{"id":"https://openalex.org/C182306322","wikidata":"https://www.wikidata.org/wiki/Q1779371","display_name":"Order (exchange)","level":2,"score":0.2572000026702881}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.06592","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06592","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.06592","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06592","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6529822945594788,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Contrastive":[0],"language-image":[1],"pretraining":[2,68],"(CLIP)":[3],"suffers":[4],"from":[5],"two":[6],"structural":[7,223],"weaknesses:":[8],"the":[9,14,25,43,60,84,109,126,138,152,208,213],"symmetric":[10],"InfoNCE":[11],"loss":[12],"discards":[13],"relative":[15,210],"ordering":[16],"among":[17],"unmatched":[18],"in-batch":[19],"pairs,":[20],"and":[21,58,92,103,134,142,147,151,189,202,215],"global":[22],"pooling":[23],"collapses":[24],"visual":[26],"representation":[27],"into":[28,83],"a":[29,47,67,79,88,93,100,104,119,184],"semantic":[30],"bottleneck":[31],"that":[32,70,107,218],"is":[33,55,77,129,158],"poorly":[34],"sensitive":[35],"to":[36,113],"fine-grained":[37,214],"local":[38,222],"structure.":[39],"RANKCLIP":[40,143,203],"partially":[41],"addresses":[42,71],"first":[44,114],"issue":[45],"with":[46,97,131,207],"list-wise":[48],"Plackett-Luce":[49,121],"ranking-consistency":[50],"loss,":[51],"but":[52],"its":[53],"model":[54,123],"strictly":[56],"first-order":[57,148],"inherits":[59],"second":[61],"weakness":[62],"untouched.":[63],"We":[64],"propose":[65],"DINORANKCLIP,":[66],"framework":[69],"both":[72],"jointly.":[73],"Our":[74],"principal":[75],"contribution":[76],"injecting":[78],"frozen":[80],"DINOv3":[81],"teacher":[82],"contrastive":[85],"trunk":[86],"through":[87],"dual-branch":[89],"lightweight":[90],"student":[91],"multi-scale":[94],"fusion":[95],"module":[96],"channel-spatial":[98],"attention,":[99],"self-attention":[101],"refiner,":[102],"conflict-aware":[105],"gate":[106],"preserves":[108],"cross-modal":[110],"alignment":[111],"up":[112],"order.":[115],"Complementarily,":[116],"we":[117],"introduce":[118],"high-order":[120],"ranking":[122],"in":[124,180],"which":[125],"per-position":[127],"utility":[128],"augmented":[130],"attention-parameterised":[132],"pairwise":[133],"tuple-wise":[135],"transition":[136],"terms;":[137],"family":[139],"contains":[140],"CLIP":[141],"as":[144],"nested":[145],"zero-order":[146],"special":[149],"cases,":[150],"optimal":[153],"order":[154,165],"on":[155,169,183,192,212],"every":[156],"benchmark":[157],"$R^*=3$.":[159],"The":[160],"full":[161],"empirical":[162],"study":[163],"--":[164,178],"sweep,":[166],"Fine-grained":[167],"Probe":[168],"five":[170],"datasets,":[171],"four-node":[172],"Modality-Gap":[173],"analysis,":[174],"six-variant":[175],"Fusion":[176],"ablation":[177],"fits":[179],"72":[181],"hours":[182],"single":[185],"eight-GPU":[186],"H100":[187],"node":[188],"trains":[190],"entirely":[191],"Conceptual":[193],"Captions":[194],"3M.":[195],"DINORANKCLIP":[196],"consistently":[197],"outperforms":[198],"CLIP,":[199],"CyCLIP,":[200],"ALIP,":[201],"under":[204],"matched":[205],"compute,":[206],"largest":[209],"gains":[211],"out-of-distribution":[216],"evaluations":[217],"most":[219],"directly":[220],"stress":[221],"reasoning.":[224]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-09T00:00:00"}
