{"id":"https://openalex.org/W4399836557","doi":"https://doi.org/10.48550/arxiv.2406.11794","title":"DataComp-LM: In search of the next generation of training sets for language models","display_name":"DataComp-LM: In search of the next generation of training sets for language models","publication_year":2024,"publication_date":"2024-06-17","ids":{"openalex":"https://openalex.org/W4399836557","doi":"https://doi.org/10.48550/arxiv.2406.11794"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2406.11794","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.11794","pdf_url":"https://arxiv.org/pdf/2406.11794","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.11794","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111296494","display_name":"Jeffrey Li","orcid":"https://orcid.org/0009-0002-4393-5773"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Jeffrey","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029444496","display_name":"Alex Chengyu Fang","orcid":"https://orcid.org/0000-0001-6651-1709"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Alex","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076496455","display_name":"Georgios Smyrnis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smyrnis, Georgios","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041579054","display_name":"Maor Ivgi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ivgi, Maor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039535109","display_name":"Matt Jordan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jordan, Matt","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039046571","display_name":"Samir Yitzhak Gadre","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gadre, Samir","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112428050","display_name":"Hritik Bansal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bansal, Hritik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042004933","display_name":"Etash Guha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guha, Etash","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052690536","display_name":"Sedrick Scott Keh","orcid":"https://orcid.org/0000-0003-0088-7951"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Keh, Sedrick","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007392934","display_name":"Kushal Arora","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Arora, Kushal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013136179","display_name":"Saurabh Garg","orcid":"https://orcid.org/0000-0001-8719-284X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Garg, Saurabh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101888897","display_name":"Rui Xin","orcid":"https://orcid.org/0000-0002-0224-3431"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000043237","display_name":"Niklas Muennighoff","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Muennighoff, Niklas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003606899","display_name":"Reinhard Heckel","orcid":"https://orcid.org/0000-0002-2874-2984"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heckel, Reinhard","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002911029","display_name":"Jean Mercat","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mercat, Jean","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039665431","display_name":"Mayee Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Mayee","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075783850","display_name":"Suchin Gururangan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gururangan, Suchin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001590363","display_name":"Mitchell Wortsman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wortsman, Mitchell","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090026363","display_name":"Alon Albalak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Albalak, Alon","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068580969","display_name":"Yonatan Bitton","orcid":"https://orcid.org/0000-0002-1185-6838"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bitton, Yonatan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042252873","display_name":"Marianna Nezhurina","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nezhurina, Marianna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102486881","display_name":"Amro Abbas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abbas, Amro","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020968560","display_name":"Cheng-Yu Hsieh","orcid":"https://orcid.org/0000-0002-2826-1435"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hsieh, Cheng-Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069431630","display_name":"Dhruba Ghosh","orcid":"https://orcid.org/0000-0002-8518-2696"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghosh, Dhruba","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110941464","display_name":"Josh Gardner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gardner, Josh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5098836789","display_name":"Maciej Kilian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kilian, Maciej","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101598769","display_name":"Hanlin Zhang","orcid":"https://orcid.org/0000-0001-8869-6863"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Hanlin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047742348","display_name":"Rulin Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Rulin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088415601","display_name":"Sarah I. Pratt","orcid":"https://orcid.org/0000-0002-5450-9334"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pratt, Sarah","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102521226","display_name":"Sunny Sanyal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sanyal, Sunny","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068360032","display_name":"Gabriel Ilharco","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ilharco, Gabriel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005479839","display_name":"Giannis Daras","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daras, Giannis","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111234364","display_name":"Kalyani Marathe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Marathe, Kalyani","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029160764","display_name":"Aaron Gokaslan","orcid":"https://orcid.org/0000-0002-3575-2961"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gokaslan, Aaron","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101518688","display_name":"Jieyu Zhang","orcid":"https://orcid.org/0000-0002-4625-3359"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jieyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007169600","display_name":"Khyathi Raghavi Chandu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chandu, Khyathi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102926742","display_name":"Thao Nguyen","orcid":"https://orcid.org/0000-0002-5016-2230"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Thao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042694706","display_name":"Igor Vasiljevic","orcid":"https://orcid.org/0000-0001-6117-0693"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vasiljevic, Igor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108381794","display_name":"Sham M. Kakade","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kakade, Sham","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004644695","display_name":"Shuran Song","orcid":"https://orcid.org/0000-0002-8768-7356"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Shuran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110619770","display_name":"Sujay Sanghavi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sanghavi, Sujay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036601505","display_name":"Fartash Faghri","orcid":"https://orcid.org/0000-0001-5975-5158"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Faghri, Fartash","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028243041","display_name":"Sewoong Oh","orcid":"https://orcid.org/0000-0002-8975-8306"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oh, Sewoong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067919401","display_name":"Luke Zettlemoyer","orcid":"https://orcid.org/0009-0008-8296-0764"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zettlemoyer, Luke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066588555","display_name":"Kyle Lo","orcid":"https://orcid.org/0000-0002-1804-2853"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lo, Kyle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083505661","display_name":"Alaaeldin El-Nouby","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"El-Nouby, Alaaeldin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059295598","display_name":"Hadi Pouransari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pouransari, Hadi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064265174","display_name":"Alexander Toshev","orcid":"https://orcid.org/0000-0003-0925-638X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Toshev, Alexander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076479697","display_name":"Stephanie Wang","orcid":"https://orcid.org/0000-0001-5960-5491"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Stephanie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059265033","display_name":"Dirk Groeneveld","orcid":"https://orcid.org/0000-0002-8274-768X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Groeneveld, Dirk","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060844217","display_name":"Luca Soldaini","orcid":"https://orcid.org/0000-0001-6998-9863"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soldaini, Luca","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079904764","display_name":"Pang Wei Koh","orcid":"https://orcid.org/0000-0003-4330-6969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koh, Pang Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085870953","display_name":"Jenia Jitsev","orcid":"https://orcid.org/0000-0002-1221-7851"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jitsev, Jenia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073644080","display_name":"Thomas Kollar","orcid":"https://orcid.org/0000-0003-2598-8118"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kollar, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014141359","display_name":"Alexandros G. Dimakis","orcid":"https://orcid.org/0000-0002-4244-7033"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dimakis, Alexandros G.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035869365","display_name":"Yair Carmon","orcid":"https://orcid.org/0000-0001-5731-8640"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Carmon, Yair","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045047925","display_name":"Achal Dave","orcid":"https://orcid.org/0000-0003-1948-5629"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dave, Achal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111490184","display_name":"Ludwig Schmidt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schmidt, Ludwig","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5112327867","display_name":"Vaishaal Shankar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shankar, Vaishaal","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":59,"corresponding_author_ids":["https://openalex.org/A5111296494"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9857000112533569,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9857000112533569,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9815000295639038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6983694434165955},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5695882439613342},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4279702603816986},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4010379910469055},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3954727053642273},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.0981738269329071}],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6983694434165955},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5695882439613342},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4279702603816986},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4010379910469055},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3954727053642273},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0981738269329071},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2406.11794","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.11794","pdf_url":"https://arxiv.org/pdf/2406.11794","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2406.11794","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2406.11794","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2406.11794","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.11794","pdf_url":"https://arxiv.org/pdf/2406.11794","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1523888516","display_name":null,"funder_award_id":"FA9550-","funder_id":"https://openalex.org/F4320338279","funder_display_name":"Air Force Office of Scientific Research"},{"id":"https://openalex.org/G1557846000","display_name":"RINGS:Deep Generative Models for Ultra High-Dimensional Next Generation Communication Systems","funder_award_id":"2148141","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G1587741752","display_name":null,"funder_award_id":"AF 1901292","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G1620434043","display_name":null,"funder_award_id":"JUWELS","funder_id":"https://openalex.org/F4320331625","funder_display_name":"Gauss Centre for Supercomputing"},{"id":"https://openalex.org/G2066136133","display_name":null,"funder_award_id":"193493","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3300713811","display_name":null,"funder_award_id":"2486/21","funder_id":"https://openalex.org/F4320322252","funder_display_name":"Israel Science Foundation"},{"id":"https://openalex.org/G3367813468","display_name":null,"funder_award_id":"FA9550-22-1-0380","funder_id":"https://openalex.org/F4320333591","funder_display_name":"Multidisciplinary University Research Initiative"},{"id":"https://openalex.org/G5809100787","display_name":null,"funder_award_id":"FA9550","funder_id":"https://openalex.org/F4320338279","funder_display_name":"Air Force Office of Scientific Research"},{"id":"https://openalex.org/G6048779419","display_name":null,"funder_award_id":"FA9550-22-1-0380","funder_id":"https://openalex.org/F4320338279","funder_display_name":"Air Force Office of Scientific Research"},{"id":"https://openalex.org/G6347559086","display_name":null,"funder_award_id":"CCF 2019844","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8722843727","display_name":null,"funder_award_id":"1934932","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320307791","display_name":"Cisco Systems","ror":"https://ror.org/03yt1ez60"},{"id":"https://openalex.org/F4320307793","display_name":"Western Digital","ror":"https://ror.org/02hqwnx33"},{"id":"https://openalex.org/F4320310620","display_name":"University of Texas at Austin","ror":"https://ror.org/00hj54h04"},{"id":"https://openalex.org/F4320315389","display_name":"Open Philanthropy Project","ror":"https://ror.org/004d1k391"},{"id":"https://openalex.org/F4320317052","display_name":"Allen Institute for Artificial Intelligence","ror":"https://ror.org/05w520734"},{"id":"https://openalex.org/F4320321114","display_name":"Bundesministerium f\u00fcr Bildung und Forschung","ror":"https://ror.org/04pz7b180"},{"id":"https://openalex.org/F4320322252","display_name":"Israel Science Foundation","ror":"https://ror.org/04sazxf24"},{"id":"https://openalex.org/F4320323220","display_name":"Alexander S. Onassis Public Benefit Foundation","ror":"https://ror.org/017nq0d63"},{"id":"https://openalex.org/F4320323278","display_name":"Council for Higher Education","ror":"https://ror.org/036mqp197"},{"id":"https://openalex.org/F4320331625","display_name":"Gauss Centre for Supercomputing","ror":"https://ror.org/0585fsj26"},{"id":"https://openalex.org/F4320333591","display_name":"Multidisciplinary University Research Initiative","ror":null},{"id":"https://openalex.org/F4320338279","display_name":"Air Force Office of Scientific Research","ror":"https://ror.org/011e9bt93"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399836557.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W3204019825"],"abstract_inverted_index":{"We":[0],"introduce":[1],"DataComp":[2],"for":[3,9,82,198,207],"Language":[4],"Models":[5],"(DCLM),":[6],"a":[7,26,45,80,97,107,136,204],"testbed":[8],"controlled":[10],"dataset":[11,196],"experiments":[12,87],"with":[13,59,120,146,182],"the":[14,41,54,127,193],"goal":[15],"of":[16,22,29,48,173,195],"improving":[17],"language":[18,110,132,176,200],"models.":[19],"As":[20,79],"part":[21],"DCLM,":[23,83],"we":[24,84],"provide":[25],"standardized":[27],"corpus":[28],"240T":[30],"tokens":[31],"extracted":[32],"from":[33,74,112],"Common":[34],"Crawl,":[35],"effective":[36],"pretraining":[37],"recipes":[38],"based":[39],"on":[40,118,141,162,170,210],"OpenLM":[42],"framework,":[43],"and":[44,67,88,158,167,202],"broad":[46],"suite":[47],"53":[49,174],"downstream":[50],"evaluations.":[51],"Participants":[52],"in":[53,130],"DCLM":[55],"benchmark":[56],"can":[57],"experiment":[58],"data":[60,68,211],"curation":[61],"strategies":[62],"such":[63],"as":[64],"deduplication,":[65],"filtering,":[66],"mixing":[69],"at":[70],"model":[71,111,152],"scales":[72],"ranging":[73],"412M":[75],"to":[76,95,114,125,156],"7B":[77,108],"parameters.":[78],"baseline":[81,151],"conduct":[85],"extensive":[86],"find":[89],"that":[90],"model-based":[91],"filtering":[92],"is":[93,153],"key":[94],"assembling":[96],"high-quality":[98],"training":[99,106,122,199],"set.":[100],"The":[101],"resulting":[102],"dataset,":[103],"DCLM-Baseline":[104,134],"enables":[105],"parameter":[109],"scratch":[113],"64%":[115],"5-shot":[116],"accuracy":[117],"MMLU":[119,142,163],"2.6T":[121],"tokens.":[123],"Compared":[124],"MAP-Neo,":[126],"previous":[128],"state-of-the-art":[129],"open-data":[131],"models,":[133],"represents":[135],"6.6":[137],"percentage":[138],"point":[139,206],"improvement":[140],"while":[143,179],"being":[144,180],"trained":[145,181],"40%":[147],"less":[148,184],"compute.":[149],"Our":[150,190],"also":[154],"comparable":[155],"Mistral-7B-v0.3":[157],"Llama":[159,187],"3":[160,188],"8B":[161],"(63%":[164],"&amp;":[165],"66%),":[166],"performs":[168],"similarly":[169],"an":[171],"average":[172],"natural":[175],"understanding":[177],"tasks":[178],"6.6x":[183],"compute":[185],"than":[186],"8B.":[189],"results":[191],"highlight":[192],"importance":[194],"design":[197],"models":[201],"offer":[203],"starting":[205],"further":[208],"research":[209],"curation.":[212]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
