{"id":"https://openalex.org/W4399695180","doi":"https://doi.org/10.48550/arxiv.2406.08707","title":"mOSCAR: A Large-scale Multilingual and Multimodal Document-level Corpus","display_name":"mOSCAR: A Large-scale Multilingual and Multimodal Document-level Corpus","publication_year":2024,"publication_date":"2024-06-13","ids":{"openalex":"https://openalex.org/W4399695180","doi":"https://doi.org/10.48550/arxiv.2406.08707"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2406.08707","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.08707","pdf_url":"https://arxiv.org/pdf/2406.08707","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.08707","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081921172","display_name":"Matthieu Futeral","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Futeral, Matthieu","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","WILLOW - Models of visual object recognition and scene understanding (France)"],"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]},{"raw_affiliation_string":"WILLOW - Models of visual object recognition and scene understanding (France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091926857","display_name":"Armel Randy Zebaze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zebaze, Armel","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054913492","display_name":"Pedro Ortiz Su\u00e1rez","orcid":"https://orcid.org/0000-0003-0343-8852"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Suarez, Pedro Ortiz","raw_affiliation_strings":["Common Crawl (United States)"],"affiliations":[{"raw_affiliation_string":"Common Crawl (United States)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033576915","display_name":"Julien Abadji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abadji, Julien","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109961869","display_name":"R\u00e9mi Lacroix","orcid":null},"institutions":[{"id":"https://openalex.org/I103562704","display_name":"ESI Group (France)","ror":"https://ror.org/05ww4z228","country_code":"FR","type":"company","lineage":["https://openalex.org/I103562704"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Lacroix, R\u00e9mi","raw_affiliation_strings":["ESI-France [Lyon] (Immeuble Le R\u00e9camier, 70 rue Robert, 69006 Lyon - France)"],"affiliations":[{"raw_affiliation_string":"ESI-France [Lyon] (Immeuble Le R\u00e9camier, 70 rue Robert, 69006 Lyon - France)","institution_ids":["https://openalex.org/I103562704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109890544","display_name":"Cordelia Schmid","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schmid, Cordelia","raw_affiliation_strings":["WILLOW - Models of visual object recognition and scene understanding (France)"],"affiliations":[{"raw_affiliation_string":"WILLOW - Models of visual object recognition and scene understanding (France)","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046431394","display_name":"Rachel Bawden","orcid":"https://orcid.org/0000-0001-9553-1768"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bawden, Rachel","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077663332","display_name":"Beno\u00eet Sagot","orcid":"https://orcid.org/0000-0002-0107-8526"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sagot, Beno\u00eet","raw_affiliation_strings":["ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)"],"affiliations":[{"raw_affiliation_string":"ALMAnaCH - Automatic Language Modelling and ANAlysis & Computational Humanities (France)","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5081921172"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.909846305847168},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8410707712173462},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6043580770492554},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5880435705184937},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5724732875823975},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.5234588980674744},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4768082797527313},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4734860360622406},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.42765697836875916},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.362570196390152},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.27991151809692383}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.909846305847168},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8410707712173462},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6043580770492554},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5880435705184937},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5724732875823975},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.5234588980674744},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4768082797527313},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4734860360622406},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.42765697836875916},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.362570196390152},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.27991151809692383},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2406.08707","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.08707","pdf_url":"https://arxiv.org/pdf/2406.08707","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2406.08707","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2406.08707","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2406.08707","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.08707","pdf_url":"https://arxiv.org/pdf/2406.08707","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6899999976158142}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399695180.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W4287644835","https://openalex.org/W3092281475","https://openalex.org/W3098003361"],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(mLLMs)":[4],"are":[5,17,74,83],"trained":[6,18,180,193,201],"on":[7,19,32,181,194,202],"a":[8,144,178,182,191,205],"large":[9],"amount":[10],"of":[11,35,44,86,115,146,160,168,175,184],"text-image":[12],"data.":[13,94],"While":[14],"most":[15],"mLLMs":[16],"caption-like":[20,87],"data":[21,188,196],"only,":[22],"Alayrac":[23],"et":[24],"al.":[25],"(2022)":[26],"showed":[27],"that":[28],"additionally":[29,164,200],"training":[30],"them":[31],"interleaved":[33],"sequences":[34],"text":[36],"and":[37,57,80,122,138,148,159,186,189,217,237],"images":[38],"can":[39,238],"lead":[40],"to":[41,66,112,151,171],"the":[42,49,71,100,106,113,118,128,173,230],"emergence":[43],"in-context":[45],"learning":[46,210],"capabilities.":[47],"However,":[48],"dataset":[50,226],"they":[51],"used,":[52],"M3W,":[53],"is":[54,58,155,227],"not":[55],"public":[56],"only":[59,88],"in":[60,105,208],"English.":[61],"There":[62],"have":[63],"been":[64],"attempts":[65],"reproduce":[67],"their":[68],"results":[69],"but":[70],"released":[72,228],"datasets":[73,82],"English-only.":[75],"In":[76],"contrast,":[77],"current":[78],"multilingual":[79,121,169,214],"multimodal":[81,123],"either":[84],"composed":[85],"or":[89,91],"medium-scale":[90],"fully":[92],"private":[93],"This":[95],"limits":[96],"mLLM":[97],"research":[98],"for":[99,222],"7,000":[101],"other":[102],"languages":[103],"spoken":[104],"world.":[107],"We":[108,141,163],"therefore":[109],"introduce":[110],"mOSCAR,":[111],"best":[114],"our":[116],"knowledge":[117],"first":[119],"large-scale":[120],"document":[124],"corpus":[125],"crawled":[126],"from":[127],"web.":[129],"It":[130],"covers":[131],"163":[132],"languages,":[133],"303M":[134],"documents,":[135],"200B":[136],"tokens":[137],"1.15B":[139],"images.":[140],"carefully":[142],"conduct":[143],"set":[145],"filtering":[147],"evaluation":[149],"steps":[150],"make":[152],"sure":[153],"mOSCAR":[154,185,203],"sufficiently":[156],"safe,":[157],"diverse":[158],"good":[161],"quality.":[162],"train":[165],"two":[166],"types":[167],"model":[170,179,192,199],"prove":[172],"benefits":[174],"mOSCAR:":[176],"(1)":[177],"subset":[183],"captioning":[187,195],"(2)":[190],"only.":[197],"The":[198,225],"shows":[204],"strong":[206],"boost":[207],"few-shot":[209],"performance":[211],"across":[212],"various":[213],"image-text":[215],"tasks":[216],"benchmarks,":[218],"confirming":[219],"previous":[220],"findings":[221],"English-only":[223],"mLLMs.":[224],"under":[229],"Creative":[231],"Commons":[232],"CC":[233],"BY":[234],"4.0":[235],"license":[236],"be":[239],"accessed":[240],"here:":[241],"https://huggingface.co/datasets/oscar-corpus/mOSCAR":[242]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
