{"id":"https://openalex.org/W7154477368","doi":"https://doi.org/10.48550/arxiv.2604.12012","title":"TIPSv2: Advancing Vision-Language Pretraining with Enhanced Patch-Text Alignment","display_name":"TIPSv2: Advancing Vision-Language Pretraining with Enhanced Patch-Text Alignment","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154477368","doi":"https://doi.org/10.48550/arxiv.2604.12012"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.12012","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12012","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.12012","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108773038","display_name":"Bingyi Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Bingyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022123903","display_name":"Koert Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Koert","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038897190","display_name":"Kevis-Kokitsi Maninis","orcid":"https://orcid.org/0000-0003-3776-0049"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maninis, Kevis-Kokitsi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133676443","display_name":"Kaifeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kaifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086235030","display_name":"Arjun Karpur","orcid":"https://orcid.org/0000-0002-7568-3550"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Karpur, Arjun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127816388","display_name":"Ye Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Ye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005885438","display_name":"Sahil Dua","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dua, Sahil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031406817","display_name":"Tanmaya Dabral","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dabral, Tanmaya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063732687","display_name":"Guangxing Han","orcid":"https://orcid.org/0000-0001-8307-8716"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Guangxing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127814512","display_name":"Bohyung Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Bohyung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072605113","display_name":"Joshua Ainslie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ainslie, Joshua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043765267","display_name":"Alex Bewley","orcid":"https://orcid.org/0000-0002-8428-9264"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bewley, Alex","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133641561","display_name":"Mithun Jacob","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacob, Mithun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133675804","display_name":"Ren\u00e9 Wagner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wagner, Ren\u00e9","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102970194","display_name":"Washington Ramos","orcid":"https://orcid.org/0000-0002-3311-7446"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ramos, Washington","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031842812","display_name":"Krzysztof Choroma\u0144ski","orcid":"https://orcid.org/0000-0003-3626-414X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Choromanski, Krzysztof","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053494387","display_name":"Mojtaba Seyedhosseini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seyedhosseini, Mojtaba","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056352820","display_name":"Howard Zhou","orcid":"https://orcid.org/0000-0003-3245-8481"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Howard","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133665390","display_name":"Andr\u00e9 Araujo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Araujo, Andr\u00e9","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":19,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9204000234603882,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9204000234603882,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.013399999588727951,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.012199999764561653,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.7263000011444092},{"id":"https://openalex.org/keywords/downstream","display_name":"Downstream (manufacturing)","score":0.5099999904632568},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4918000102043152},{"id":"https://openalex.org/keywords/spotting","display_name":"Spotting","score":0.40389999747276306},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.39250001311302185},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.33970001339912415},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.33160001039505005}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7351999878883362},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.7263000011444092},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.609499990940094},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.5099999904632568},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4918000102043152},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.48420000076293945},{"id":"https://openalex.org/C2779506182","wikidata":"https://www.wikidata.org/wiki/Q7580141","display_name":"Spotting","level":2,"score":0.40389999747276306},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.39250001311302185},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34209999442100525},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.33970001339912415},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.33160001039505005},{"id":"https://openalex.org/C37488316","wikidata":"https://www.wikidata.org/wiki/Q53699","display_name":"Rotary encoder","level":3,"score":0.3206000030040741},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3183000087738037},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.31700000166893005},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30059999227523804},{"id":"https://openalex.org/C2780615140","wikidata":"https://www.wikidata.org/wiki/Q920419","display_name":"Upgrade","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2915000021457672},{"id":"https://openalex.org/C166704113","wikidata":"https://www.wikidata.org/wiki/Q861092","display_name":"Image registration","level":3,"score":0.26350000500679016},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.12012","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12012","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.12012","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.12012","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.714072585105896,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"progress":[1],"in":[2,60,150],"vision-language":[3,62,138],"pretraining":[4,102,139],"has":[5],"enabled":[6],"significant":[7],"improvements":[8],"to":[9,56,98,101,106,111,124,136,160],"many":[10],"downstream":[11,187],"computer":[12],"vision":[13,210],"applications,":[14],"such":[15],"as":[16],"classification,":[17],"retrieval,":[18],"segmentation":[19],"and":[20,52,141,154,195,214],"depth":[21],"prediction.":[22],"However,":[23],"a":[24,68,156,174,183],"fundamental":[25],"capability":[26,59],"that":[27,67,89],"these":[28,169],"models":[29,180,215],"still":[30],"struggle":[31],"with":[32,38,205],"is":[33],"aligning":[34],"dense":[35,74],"patch":[36],"representations":[37],"text":[39],"embeddings":[40],"of":[41,82,90,132,177,186],"corresponding":[42],"concepts.":[43],"In":[44],"this":[45,49,58],"work,":[46],"we":[47,65,143,171,198],"investigate":[48],"critical":[50],"issue":[51],"propose":[53,107],"novel":[54],"techniques":[55],"enhance":[57],"foundational":[61],"models.":[63,134,212],"First,":[64],"reveal":[66],"patch-level":[69],"distillation":[70],"procedure":[71],"significantly":[72],"boosts":[73],"patch-text":[75,80,130],"alignment":[76,81,131],"--":[77],"surprisingly,":[78],"the":[79,83,91,112,125,145,151],"distilled":[84],"student":[85],"model":[86],"strongly":[87],"surpasses":[88],"teacher":[92],"model.":[93],"This":[94,127],"observation":[95],"inspires":[96],"us":[97,105],"consider":[99],"modifications":[100],"recipes,":[103],"leading":[104],"iBOT++,":[108],"an":[109],"upgrade":[110],"commonly-used":[113],"iBOT":[114],"masked":[115],"image":[116],"objective,":[117],"where":[118],"unmasked":[119],"tokens":[120],"also":[121],"contribute":[122],"directly":[123],"loss.":[126],"dramatically":[128],"enhances":[129],"pretrained":[133],"Additionally,":[135],"improve":[137],"efficiency":[140],"effectiveness,":[142],"modify":[144],"exponential":[146],"moving":[147],"average":[148],"setup":[149],"learning":[152],"recipe,":[153],"introduce":[155],"caption":[157],"sampling":[158],"strategy":[159],"benefit":[161],"from":[162],"synthetic":[163],"captions":[164],"at":[165,222],"different":[166],"granularities.":[167],"Combining":[168],"components,":[170],"develop":[172],"TIPSv2,":[173],"new":[175],"family":[176],"image-text":[178],"encoder":[179,211],"suitable":[181],"for":[182],"wide":[184],"range":[185],"applications.":[188],"Through":[189],"comprehensive":[190],"experiments":[191],"on":[192,203],"9":[193],"tasks":[194],"20":[196],"datasets,":[197],"demonstrate":[199],"strong":[200],"performance,":[201],"generally":[202],"par":[204],"or":[206],"better":[207],"than":[208],"recent":[209],"Code":[213],"are":[216],"released":[217],"via":[218],"our":[219],"project":[220],"page":[221],"https://gdm-tipsv2.github.io/":[223],".":[224]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-16T00:00:00"}
