{"id":"https://openalex.org/W4391514311","doi":"https://doi.org/10.48550/arxiv.2402.00159","title":"Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research","display_name":"Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research","publication_year":2024,"publication_date":"2024-01-31","ids":{"openalex":"https://openalex.org/W4391514311","doi":"https://doi.org/10.48550/arxiv.2402.00159"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2402.00159","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.00159","pdf_url":"https://arxiv.org/pdf/2402.00159","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2402.00159","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060844217","display_name":"Luca Soldaini","orcid":"https://orcid.org/0000-0001-6998-9863"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Soldaini, Luca","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056105462","display_name":"Rodney Kinney","orcid":"https://orcid.org/0000-0002-3582-6537"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kinney, Rodney","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041892782","display_name":"Akshita Bhagia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhagia, Akshita","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007645100","display_name":"Dustin Schwenk","orcid":"https://orcid.org/0000-0003-1666-6889"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schwenk, Dustin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041721032","display_name":"David Atkinson","orcid":"https://orcid.org/0000-0003-1124-6666"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Atkinson, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083280536","display_name":"Russell Authur","orcid":"https://orcid.org/0009-0002-1238-4323"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Authur, Russell","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051169643","display_name":"Ben Bogin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bogin, Ben","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007169600","display_name":"Khyathi Raghavi Chandu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chandu, Khyathi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001761515","display_name":"Jennifer Dumas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dumas, Jennifer","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081462507","display_name":"Yanai Elazar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Elazar, Yanai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014923355","display_name":"Valentin Hofmann","orcid":"https://orcid.org/0000-0001-6603-3428"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hofmann, Valentin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073589706","display_name":"Ananya Harsh Jha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jha, Ananya Harsh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032948428","display_name":"Sachin Kumar","orcid":"https://orcid.org/0000-0003-3949-0302"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kumar, Sachin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072860546","display_name":"Li Lucy","orcid":"https://orcid.org/0009-0002-8077-6310"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lucy, Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009648800","display_name":"Xinxi Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Xinxi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083548887","display_name":"Nathan Lambert","orcid":"https://orcid.org/0000-0002-9997-6817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lambert, Nathan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074637993","display_name":"Ian Magnusson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Magnusson, Ian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088629436","display_name":"Jacob Morrison","orcid":"https://orcid.org/0000-0001-8592-4744"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Morrison, Jacob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000043237","display_name":"Niklas Muennighoff","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Muennighoff, Niklas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087743328","display_name":"Aakanksha Naik","orcid":"https://orcid.org/0000-0002-3673-0051"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Naik, Aakanksha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093854360","display_name":"Crystal Nam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nam, Crystal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101509445","display_name":"Matthew E. Peters","orcid":"https://orcid.org/0000-0002-2105-2585"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peters, Matthew E.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021017923","display_name":"Abhilasha Ravichander","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ravichander, Abhilasha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039359682","display_name":"Kyle Richardson","orcid":"https://orcid.org/0000-0003-4836-0753"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Richardson, Kyle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038732616","display_name":"Zejiang Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Zejiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051203366","display_name":"Emma Strubell","orcid":"https://orcid.org/0000-0003-2798-0726"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Strubell, Emma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109481132","display_name":"Nishant Subramani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Subramani, Nishant","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071250061","display_name":"Oyvind Tafjord","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tafjord, Oyvind","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051954092","display_name":"Pete Walsh","orcid":"https://orcid.org/0000-0002-1121-8481"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Walsh, Pete","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067919401","display_name":"Luke Zettlemoyer","orcid":"https://orcid.org/0009-0008-8296-0764"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zettlemoyer, Luke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088517824","display_name":"Noah A. Smith","orcid":"https://orcid.org/0000-0002-2310-6380"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smith, Noah A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082305994","display_name":"Hannaneh Hajishirzi","orcid":"https://orcid.org/0000-0002-1055-6657"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hajishirzi, Hannaneh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090038537","display_name":"Iz Beltagy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Beltagy, Iz","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059265033","display_name":"Dirk Groeneveld","orcid":"https://orcid.org/0000-0002-8274-768X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Groeneveld, Dirk","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008013895","display_name":"Jesse Dodge","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dodge, Jesse","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066588555","display_name":"Kyle Lo","orcid":"https://orcid.org/0000-0002-1804-2853"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lo, Kyle","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":36,"corresponding_author_ids":["https://openalex.org/A5060844217"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7731000185012817,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7731000185012817,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5847313404083252},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.48321211338043213},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.4472716748714447},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.355069637298584},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.07126304507255554}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5847313404083252},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.48321211338043213},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.4472716748714447},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.355069637298584},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.07126304507255554}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2402.00159","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.00159","pdf_url":"https://arxiv.org/pdf/2402.00159","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2402.00159","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2402.00159","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2402.00159","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2402.00159","pdf_url":"https://arxiv.org/pdf/2402.00159","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4391514311.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2478288626","https://openalex.org/W2350741829","https://openalex.org/W2530322880","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Information":[0],"about":[1,107,133],"pretraining":[2],"corpora":[3],"used":[4],"to":[5,34,43,127,145],"train":[6],"the":[7],"current":[8],"best-performing":[9],"language":[10,50,68],"models":[11,16,24],"is":[12,41],"seldom":[13],"discussed:":[14],"commercial":[15],"rarely":[17],"detail":[18],"their":[19],"data,":[20],"and":[21,45,61,73,95,110,119],"even":[22],"open":[23],"are":[25],"often":[26],"released":[27],"without":[28],"accompanying":[29],"training":[30,56],"data":[31,57,135,142,159],"or":[32],"recipes":[33],"reproduce":[35],"them.":[36],"As":[37],"a":[38,76,82,111],"result,":[39],"it":[40],"challenging":[42],"conduct":[44],"advance":[46],"scientific":[47,65,88],"research":[48,66,156],"on":[49,67,122],"modeling,":[51],"such":[52],"as":[53,151,153],"understanding":[54],"how":[55],"impacts":[58],"model":[59,69],"capabilities":[60],"limitations.":[62],"To":[63],"facilitate":[64],"pretraining,":[70],"we":[71,130,139],"curate":[72],"release":[74],"Dolma,":[75,101],"three-trillion-token":[77],"English":[78],"corpus,":[79],"built":[80],"from":[81],"diverse":[83],"mixture":[84],"of":[85,113,125,148],"web":[86],"content,":[87],"papers,":[89],"code,":[90],"public-domain":[91],"books,":[92],"social":[93],"media,":[94],"encyclopedic":[96],"materials.":[97],"We":[98,116],"extensively":[99],"document":[100],"including":[102],"its":[103,108,114],"design":[104],"principles,":[105],"details":[106],"construction,":[109],"summary":[112],"contents.":[115],"present":[117],"analyses":[118],"experimental":[120],"results":[121],"intermediate":[123],"states":[124],"Dolma":[126],"share":[128],"what":[129],"have":[131],"learned":[132],"important":[134],"curation":[136,143],"practices.":[137],"Finally,":[138],"open-source":[140],"our":[141,149],"toolkit":[144],"enable":[146],"reproduction":[147],"work":[150],"well":[152],"support":[154],"further":[155],"in":[157],"large-scale":[158],"curation.":[160]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":6}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2024-02-04T00:00:00"}
