{"id":"https://openalex.org/W7155397894","doi":"https://doi.org/10.48550/arxiv.2604.20329","title":"Image Generators are Generalist Vision Learners","display_name":"Image Generators are Generalist Vision Learners","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155397894","doi":"https://doi.org/10.48550/arxiv.2604.20329"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.20329","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20329","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.20329","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001789528","display_name":"Valentin Gabeur","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gabeur, Valentin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024174755","display_name":"Shangbang Long","orcid":"https://orcid.org/0000-0002-4089-5369"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Shangbang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134422680","display_name":"Songyou Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Songyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041202059","display_name":"Paul Voigtlaender","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Voigtlaender, Paul","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134449220","display_name":"Shuyang Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Shuyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134432141","display_name":"Yanan Bao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bao, Yanan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087186917","display_name":"Karen Truong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Truong, Karen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134459017","display_name":"Zhicheng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhicheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058224708","display_name":"Wenlei Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wenlei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024117395","display_name":"Jonathan T. Barron","orcid":"https://orcid.org/0009-0008-4016-9448"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barron, Jonathan T.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060906167","display_name":"Kyle Genova","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Genova, Kyle","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134398088","display_name":"Nithish Kannen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kannen, Nithish","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111322490","display_name":"Sherry Ben","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ben, Sherry","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134405319","display_name":"Yandong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yandong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004864409","display_name":"Mandy Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Mandy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134366980","display_name":"Suhas Yogin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yogin, Suhas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102216183","display_name":"Yiming Gu","orcid":"https://orcid.org/0009-0009-4077-096X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Yiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134365865","display_name":"Huizhong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Huizhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074776688","display_name":"Oliver Wang","orcid":"https://orcid.org/0000-0001-6240-1342"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Oliver","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134441876","display_name":"Saining Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Saining","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134419349","display_name":"Howard Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Howard","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100700361","display_name":"Kaiming He","orcid":"https://orcid.org/0000-0001-7318-9658"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Kaiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134445139","display_name":"Thomas Funkhouser","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Funkhouser, Thomas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012826868","display_name":"Jean-Baptiste Alayrac","orcid":"https://orcid.org/0000-0002-3071-4157"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alayrac, Jean-Baptiste","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5089152779","display_name":"Radu Soricut","orcid":"https://orcid.org/0000-0003-1565-3365"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soricut, Radu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":25,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.29420000314712524,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.29420000314712524,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.04740000143647194,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12090","display_name":"Language and cultural evolution","score":0.047200001776218414,"subfield":{"id":"https://openalex.org/subfields/3316","display_name":"Cultural Studies"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/vision-science","display_name":"Vision science","score":0.5220000147819519},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.4986000061035156},{"id":"https://openalex.org/keywords/machine-vision","display_name":"Machine vision","score":0.4643000066280365},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.44130000472068787},{"id":"https://openalex.org/keywords/generalist-and-specialist-species","display_name":"Generalist and specialist species","score":0.4237000048160553},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.42149999737739563},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.41429999470710754},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.40849998593330383}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6470999717712402},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6304000020027161},{"id":"https://openalex.org/C200220432","wikidata":"https://www.wikidata.org/wiki/Q7936208","display_name":"Vision science","level":2,"score":0.5220000147819519},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.4986000061035156},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4708999991416931},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.4643000066280365},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.44130000472068787},{"id":"https://openalex.org/C45371612","wikidata":"https://www.wikidata.org/wiki/Q3058587","display_name":"Generalist and specialist species","level":3,"score":0.4237000048160553},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.42149999737739563},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.41429999470710754},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.40849998593330383},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4007999897003174},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.36730000376701355},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3476000130176544},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3296000063419342},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3179999887943268},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2824999988079071},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.27000001072883606},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2563999891281128}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.20329","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20329","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.20329","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.20329","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7144826054573059}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"works":[1],"show":[2,188],"that":[3,37,55,69,89,189,210,222],"image":[4,70,143,203,211,223],"and":[5,26,80,85,162,178,229,243,272],"video":[6],"generators":[7],"exhibit":[8],"zero-shot":[9,168],"visual":[10,42,87],"understanding":[11,25,62,242],"behaviors,":[12],"in":[13,240,264],"a":[14,74,101,112,120,154,215,227,249,261],"way":[15],"reminiscent":[16],"of":[17,23,114,123,132,156],"how":[18],"LLMs":[19],"develop":[20],"emergent":[21],"capabilities":[22],"language":[24,241],"reasoning":[27],"from":[28],"generative":[29,56,257],"pretraining.":[30],"While":[31],"it":[32],"has":[33,51],"long":[34],"been":[35,52],"conjectured":[36],"the":[38,129,179,200],"ability":[39,46],"to":[40,47,77,236],"create":[41],"content":[43],"implies":[44],"an":[45],"understand":[48],"it,":[49],"there":[50],"limited":[53],"evidence":[54],"vision":[57,95,124,133,157,217,233,258],"models":[58,82],"have":[59],"developed":[60],"strong":[61],"capabilities.":[63,205],"In":[64],"this":[65],"work,":[66],"we":[67,138],"demonstrate":[68],"generation":[71,204,212,224,271],"training":[72,117],"serves":[73,225],"role":[75,239,263],"similar":[76,235],"LLM":[78],"pretraining,":[79],"lets":[81],"learn":[83],"powerful":[84],"general":[86],"representations":[88],"enable":[90],"SOTA":[91,151],"performance":[92],"on":[93,111,153,175,183],"various":[94],"tasks.":[96],"We":[97,187,245],"introduce":[98],"Vision":[99,148,267],"Banana,":[100,149],"generalist":[102,146,216],"model":[103],"built":[104],"by":[105],"instruction-tuning":[106,197],"Nano":[107],"Banana":[108],"Pro":[109],"(NBP)":[110],"mixture":[113],"its":[115],"original":[116],"data":[118],"alongside":[119],"small":[121],"amount":[122],"task":[125],"data.":[126],"By":[127],"parameterizing":[128],"output":[130],"space":[131],"tasks":[134,158],"as":[135,142,226],"RGB":[136],"images,":[137],"seamlessly":[139],"reframe":[140],"perception":[141],"generation.":[144],"Our":[145],"model,":[147],"achieves":[150],"results":[152,191,208],"variety":[155],"involving":[159],"both":[160,270],"2D":[161],"3D":[163],"understanding,":[164],"beating":[165],"or":[166],"rivaling":[167],"domain-specialists,":[169],"including":[170],"Segment":[171],"Anything":[172,181],"Model":[173],"3":[174],"segmentation":[176],"tasks,":[177,234],"Depth":[180],"series":[182],"metric":[184],"depth":[185],"estimation.":[186],"these":[190],"can":[192],"be":[193,247],"achieved":[194],"with":[195],"lightweight":[196],"without":[198],"sacrificing":[199],"base":[201],"model's":[202],"The":[206],"superior":[207],"suggest":[209],"pretraining":[213,259],"is":[214],"learner.":[218],"It":[219],"also":[220],"shows":[221],"unified":[228],"universal":[230],"interface":[231],"for":[232,253,269],"text":[237],"generation's":[238],"reasoning.":[244],"could":[246],"witnessing":[248],"major":[250],"paradigm":[251],"shift":[252],"computer":[254],"vision,":[255],"where":[256],"takes":[260],"central":[262],"building":[265],"Foundational":[266],"Models":[268],"understanding.":[273]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-24T00:00:00"}
