{"id":"https://openalex.org/W7133498487","doi":"https://doi.org/10.48550/arxiv.2603.03276","title":"Beyond Language Modeling: An Exploration of Multimodal Pretraining","display_name":"Beyond Language Modeling: An Exploration of Multimodal Pretraining","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133498487","doi":"https://doi.org/10.48550/arxiv.2603.03276"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.03276","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03276","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.03276","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055545928","display_name":"Shengbang Tong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tong, Shengbang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128069309","display_name":"David Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128056424","display_name":"John Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, John","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128072176","display_name":"Ellis Brown","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brown, Ellis","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061988419","display_name":"Gaoyue Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Gaoyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030632256","display_name":"Shengyi Qian","orcid":"https://orcid.org/0000-0003-0262-2412"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Shengyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128062467","display_name":"Boyang Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Boyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081869992","display_name":"Th\u00e9ophane Vallaeys","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vallaeys, Th\u00e9ophane","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128119585","display_name":"Junlin Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Junlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089960673","display_name":"Rob Fergus","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fergus, Rob","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051137149","display_name":"Naila Murray","orcid":"https://orcid.org/0000-0001-7032-0403"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Murray, Naila","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128097575","display_name":"Marjan Ghazvininejad","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghazvininejad, Marjan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128040059","display_name":"Mike Lewis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lewis, Mike","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122561118","display_name":"Nicolas Ballas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ballas, Nicolas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128069964","display_name":"Amir Bar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bar, Amir","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089819604","display_name":"Michael Rabbat","orcid":"https://orcid.org/0000-0003-0536-7904"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rabbat, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040312210","display_name":"Jakob Verbeek","orcid":"https://orcid.org/0000-0003-1419-1816"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Verbeek, Jakob","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128070386","display_name":"Luke Zettlemoyer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zettlemoyer, Luke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109576489","display_name":"Koustuv Sinha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sinha, Koustuv","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113433465","display_name":"Yann Lecun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"LeCun, Yann","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128108459","display_name":"Saining Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Saining","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":21,"corresponding_author_ids":["https://openalex.org/A5055545928"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.960099995136261,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.960099995136261,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.009200000204145908,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.550000011920929},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.48559999465942383},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.48420000076293945},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.46209999918937683},{"id":"https://openalex.org/keywords/clarity","display_name":"CLARITY","score":0.4560000002384186},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.37560001015663147},{"id":"https://openalex.org/keywords/multimodal-learning","display_name":"Multimodal learning","score":0.36910000443458557},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.36070001125335693},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3303999900817871}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6489999890327454},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.550000011920929},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.48559999465942383},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.48420000076293945},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.46209999918937683},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45750001072883606},{"id":"https://openalex.org/C2777146004","wikidata":"https://www.wikidata.org/wiki/Q14949826","display_name":"CLARITY","level":2,"score":0.4560000002384186},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.37560001015663147},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.36910000443458557},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3303999900817871},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.31610000133514404},{"id":"https://openalex.org/C2779019669","wikidata":"https://www.wikidata.org/wiki/Q25203946","display_name":"Asynchrony (computer programming)","level":3,"score":0.3000999987125397},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.2962000072002411},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2831000089645386},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.27239999175071716},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.26190000772476196},{"id":"https://openalex.org/C522192633","wikidata":"https://www.wikidata.org/wiki/Q34228","display_name":"Sign language","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.25220000743865967},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.25130000710487366},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.03276","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03276","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.03276","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.03276","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6244173049926758,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"visual":[1,91,97,102],"world":[2,121],"offers":[3],"a":[4,156],"critical":[5],"axis":[6],"for":[7,22,57,61,111,151,195],"advancing":[8],"foundation":[9],"models":[10,25],"beyond":[11],"language.":[12,165],"Despite":[13],"growing":[14],"interest":[15],"in":[16],"this":[17,173],"direction,":[18],"the":[19,38,51,169,178,187,193],"design":[20],"space":[21],"native":[23],"multimodal":[24,42,116,137,198],"remains":[26],"opaque.":[27],"We":[28,49,166],"provide":[29],"empirical":[30],"clarity":[31],"through":[32],"controlled,":[33],"from-scratch":[34],"pretraining":[35,43,117],"experiments,":[36],"isolating":[37],"factors":[39],"that":[40,168],"govern":[41],"without":[44],"interference":[45],"from":[46,126],"language":[47,58,104,184],"pretraining.":[48],"adopt":[50],"Transfusion":[52],"framework,":[53],"using":[54],"next-token":[55],"prediction":[56],"and":[59,73,99,103,108,129,135,154],"diffusion":[60],"vision,":[62,191],"to":[63,120],"train":[64],"on":[65],"diverse":[66],"data":[67,105],"including":[68],"text,":[69],"video,":[70],"image-text":[71],"pairs,":[72],"even":[74],"action-conditioned":[75],"video.":[76],"Our":[77],"experiments":[78],"yield":[79,109],"four":[80],"key":[81],"insights:":[82],"(i)":[83],"Representation":[84],"Autoencoder":[85],"(RAE)":[86],"provides":[87],"an":[88],"optimal":[89],"unified":[90,115,197],"representation":[92],"by":[93,176,183],"excelling":[94],"at":[95],"both":[96,152],"understanding":[98],"generation;":[100],"(ii)":[101],"are":[106],"complementary":[107],"synergy":[110],"downstream":[112],"capabilities;":[113],"(iii)":[114],"leads":[118],"naturally":[119,140],"modeling,":[122],"with":[123],"capabilities":[124],"emerging":[125],"general":[127],"training;":[128],"(iv)":[130],"Mixture-of-Experts":[131],"(MoE)":[132],"enables":[133],"efficient":[134],"effective":[136],"scaling":[138,149,157,174],"while":[139,185],"inducing":[141],"modality":[142],"specialization.":[143],"Through":[144],"IsoFLOP":[145],"analysis,":[146],"we":[147],"compute":[148],"laws":[150],"modalities":[153],"uncover":[155],"asymmetry:":[158],"vision":[159],"is":[160],"significantly":[161],"more":[162],"data-hungry":[163],"than":[164],"demonstrate":[167],"MoE":[170],"architecture":[171],"harmonizes":[172],"asymmetry":[175],"providing":[177],"high":[179],"model":[180],"capacity":[181],"required":[182],"accommodating":[186],"data-intensive":[188],"nature":[189],"of":[190],"paving":[192],"way":[194],"truly":[196],"models.":[199]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-05T00:00:00"}
