{"id":"https://openalex.org/W7137837393","doi":"https://doi.org/10.48550/arxiv.2603.14482","title":"V-JEPA 2.1: Unlocking Dense Features in Video Self-Supervised Learning","display_name":"V-JEPA 2.1: Unlocking Dense Features in Video Self-Supervised Learning","publication_year":2026,"publication_date":"2026-03-15","ids":{"openalex":"https://openalex.org/W7137837393","doi":"https://doi.org/10.48550/arxiv.2603.14482"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14482","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14482","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14482","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001202571","display_name":"Lorenzo Mur-Labadia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mur-Labadia, Lorenzo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029739727","display_name":"Matthew J. Muckley","orcid":"https://orcid.org/0000-0002-6525-8817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Muckley, Matthew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129677399","display_name":"Amir Bar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bar, Amir","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129674900","display_name":"Mido Assran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Assran, Mido","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109576489","display_name":"Koustuv Sinha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sinha, Koustuv","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129718124","display_name":"Mike Rabbat","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rabbat, Mike","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129743562","display_name":"Yann LeCun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"LeCun, Yann","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122561118","display_name":"Nicolas Ballas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ballas, Nicolas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129683442","display_name":"Adrien Bardes","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bardes, Adrien","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.30489999055862427,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.30489999055862427,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.2126999944448471,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.12070000171661377,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5943999886512756},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5491999983787537},{"id":"https://openalex.org/keywords/anticipation","display_name":"Anticipation (artificial intelligence)","score":0.46889999508857727},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4684000015258789},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.460999995470047},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.4505999982357025},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4424000084400177},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4059999883174896}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7695000171661377},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7105000019073486},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5943999886512756},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5491999983787537},{"id":"https://openalex.org/C176777502","wikidata":"https://www.wikidata.org/wiki/Q4774623","display_name":"Anticipation (artificial intelligence)","level":2,"score":0.46889999508857727},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4684000015258789},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.460999995470047},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.4505999982357025},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4424000084400177},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43050000071525574},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4059999883174896},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.396699994802475},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3779999911785126},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3765000104904175},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.33469998836517334},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.31610000133514404},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.30820000171661377},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2904999852180481},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2678000032901764},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2612000107765198},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C163175372","wikidata":"https://www.wikidata.org/wiki/Q3339222","display_name":"Linear model","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14482","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14482","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14482","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14482","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"V-JEPA":[2,116,190],"2.1,":[3],"a":[4,33,38,146,177],"family":[5],"of":[6,196],"self-supervised":[7,64],"models":[8],"that":[9,106,189],"learn":[10],"dense,":[11],"high-quality":[12],"visual":[13,201],"representations":[14,105],"for":[15,130,139],"both":[16,43,94],"images":[17,83],"and":[18,45,56,84,97,112,134,180,203],"videos":[19],"while":[20],"retaining":[21],"strong":[22,161],"global":[23,181],"scene":[24],"understanding.":[25],"The":[26,157],"approach":[27],"combines":[28],"four":[29],"key":[30],"components.":[31],"First,":[32],"dense":[34,200],"predictive":[35],"loss":[36],"uses":[37],"masking-based":[39],"objective":[40,65],"in":[41,93,149,163,199],"which":[42],"visible":[44],"masked":[46],"tokens":[47],"contribute":[48],"to":[49,72],"the":[50,63,87,194,197],"training":[51,81,98],"signal,":[52],"encouraging":[53],"explicit":[54],"spatial":[55],"temporal":[57],"grounding.":[58],"Second,":[59],"deep":[60],"self-supervision":[61],"applies":[62],"hierarchically":[66],"across":[67,82],"multiple":[68],"intermediate":[69],"encoder":[70],"layers":[71],"improve":[73],"representation":[74],"quality.":[75],"Third,":[76],"multi-modal":[77],"tokenizers":[78],"enable":[79],"unified":[80],"videos.":[85],"Finally,":[86],"model":[88,95,158],"benefits":[89],"from":[90],"effective":[91],"scaling":[92],"capacity":[96],"data.":[99],"Together,":[100],"these":[101],"design":[102],"choices":[103],"produce":[104],"are":[107],"spatially":[108],"structured,":[109],"semantically":[110],"coherent,":[111],"temporally":[113],"consistent.":[114],"Empirically,":[115],"2.1":[117,191],"achieves":[118],"state-of-the-art":[119],"performance":[120,162],"on":[121,128,137,168,174,184],"several":[122],"challenging":[123],"benchmarks,":[124],"including":[125],"7.71":[126],"mAP":[127],"Ego4D":[129],"short-term":[131],"object-interaction":[132],"anticipation":[133],"40.8":[135],"Recall@5":[136],"EPIC-KITCHENS":[138],"high-level":[140],"action":[141],"anticipation,":[142],"as":[143,145],"well":[144],"20-point":[147],"improvement":[148],"real-robot":[150],"grasping":[151],"success":[152],"rate":[153],"over":[154],"V-JEPA-2":[155],"AC.":[156],"also":[159],"demonstrates":[160],"robotic":[164],"navigation":[165],"(5.687":[166],"ATE":[167],"TartanDrive),":[169],"depth":[170],"estimation":[171],"(0.307":[172],"RMSE":[173],"NYUv2":[175],"with":[176],"linear":[178],"probe),":[179],"recognition":[182],"(77.7":[183],"Something-Something-V2).":[185],"These":[186],"results":[187],"show":[188],"significantly":[192],"advances":[193],"state":[195],"art":[198],"understanding":[202],"world":[204],"modeling.":[205]},"counts_by_year":[],"updated_date":"2026-06-13T06:13:01.061226","created_date":"2026-03-18T00:00:00"}
