{"id":"https://openalex.org/W4402671286","doi":"https://doi.org/10.18653/v1/2024.acl-long.840","title":"Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research","display_name":"Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4402671286","doi":"https://doi.org/10.18653/v1/2024.acl-long.840"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2024.acl-long.840","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2024.acl-long.840","pdf_url":"https://aclanthology.org/2024.acl-long.840.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2024.acl-long.840.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060844217","display_name":"Luca Soldaini","orcid":"https://orcid.org/0000-0001-6998-9863"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Luca Soldaini","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056105462","display_name":"Rodney Kinney","orcid":"https://orcid.org/0000-0002-3582-6537"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rodney Kinney","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041892782","display_name":"Akshita Bhagia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Akshita Bhagia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007645100","display_name":"Dustin Schwenk","orcid":"https://orcid.org/0000-0003-1666-6889"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dustin Schwenk","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041721032","display_name":"David Atkinson","orcid":"https://orcid.org/0000-0003-1124-6666"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"David Atkinson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083280536","display_name":"Russell Authur","orcid":"https://orcid.org/0009-0002-1238-4323"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Russell Authur","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051169643","display_name":"Ben Bogin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ben Bogin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007169600","display_name":"Khyathi Raghavi Chandu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khyathi Chandu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001761515","display_name":"Jennifer Dumas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jennifer Dumas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081462507","display_name":"Yanai Elazar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yanai Elazar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014923355","display_name":"Valentin Hofmann","orcid":"https://orcid.org/0000-0001-6603-3428"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Valentin Hofmann","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058654569","display_name":"Ananya Jha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ananya Jha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032948428","display_name":"Sachin Kumar","orcid":"https://orcid.org/0000-0003-3949-0302"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sachin Kumar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072860546","display_name":"Li Lucy","orcid":"https://orcid.org/0009-0002-8077-6310"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li Lucy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009648800","display_name":"Xinxi Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xinxi Lyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083548887","display_name":"Nathan Lambert","orcid":"https://orcid.org/0000-0002-9997-6817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nathan Lambert","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074637993","display_name":"Ian Magnusson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ian Magnusson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088629436","display_name":"Jacob Morrison","orcid":"https://orcid.org/0000-0001-8592-4744"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacob Morrison","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000043237","display_name":"Niklas Muennighoff","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niklas Muennighoff","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087743328","display_name":"Aakanksha Naik","orcid":"https://orcid.org/0000-0002-3673-0051"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aakanksha Naik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093854360","display_name":"Crystal Nam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Crystal Nam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101509445","display_name":"Matthew E. Peters","orcid":"https://orcid.org/0000-0002-2105-2585"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matthew Peters","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021017923","display_name":"Abhilasha Ravichander","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abhilasha Ravichander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039359682","display_name":"Kyle Richardson","orcid":"https://orcid.org/0000-0003-4836-0753"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kyle Richardson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038732616","display_name":"Zejiang Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zejiang Shen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051203366","display_name":"Emma Strubell","orcid":"https://orcid.org/0000-0003-2798-0726"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Emma Strubell","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109481132","display_name":"Nishant Subramani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nishant Subramani","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071250061","display_name":"Oyvind Tafjord","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oyvind Tafjord","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084656892","display_name":"Evan Walsh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Evan Walsh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067919401","display_name":"Luke Zettlemoyer","orcid":"https://orcid.org/0009-0008-8296-0764"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luke Zettlemoyer","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102004623","display_name":"Noah A. Smith","orcid":"https://orcid.org/0000-0002-2387-9789"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Noah Smith","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082305994","display_name":"Hannaneh Hajishirzi","orcid":"https://orcid.org/0000-0002-1055-6657"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hannaneh Hajishirzi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090038537","display_name":"Iz Beltagy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Iz Beltagy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059265033","display_name":"Dirk Groeneveld","orcid":"https://orcid.org/0000-0002-8274-768X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dirk Groeneveld","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008013895","display_name":"Jesse Dodge","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jesse Dodge","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066588555","display_name":"Kyle Lo","orcid":"https://orcid.org/0000-0002-1804-2853"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kyle Lo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":36,"corresponding_author_ids":["https://openalex.org/A5060844217"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":12.8469,"has_fulltext":true,"cited_by_count":38,"citation_normalized_percentile":{"value":0.989884,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"15725","last_page":"15788"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7264999747276306,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7264999747276306,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7677377462387085},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6045414209365845},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5234636664390564},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5132734775543213}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7677377462387085},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6045414209365845},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5234636664390564},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5132734775543213}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2024.acl-long.840","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2024.acl-long.840","pdf_url":"https://aclanthology.org/2024.acl-long.840.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2024.acl-long.840","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2024.acl-long.840","pdf_url":"https://aclanthology.org/2024.acl-long.840.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4402671286.pdf","grobid_xml":"https://content.openalex.org/works/W4402671286.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Luca":[0],"Soldaini,":[1],"Rodney":[2],"Kinney,":[3],"Akshita":[4],"Bhagia,":[5],"Dustin":[6],"Schwenk,":[7],"David":[8],"Atkinson,":[9],"Russell":[10],"Authur,":[11],"Ben":[12],"Bogin,":[13],"Khyathi":[14],"Chandu,":[15],"Jennifer":[16],"Dumas,":[17],"Yanai":[18],"Elazar,":[19],"Valentin":[20],"Hofmann,":[21],"Ananya":[22],"Jha,":[23],"Sachin":[24],"Kumar,":[25],"Li":[26],"Lucy,":[27],"Xinxi":[28],"Lyu,":[29],"Nathan":[30],"Lambert,":[31],"Ian":[32],"Magnusson,":[33],"Jacob":[34],"Morrison,":[35],"Niklas":[36],"Muennighoff,":[37],"Aakanksha":[38],"Naik,":[39],"Crystal":[40],"Nam,":[41],"Matthew":[42],"Peters,":[43],"Abhilasha":[44],"Ravichander,":[45],"Kyle":[46,70],"Richardson,":[47],"Zejiang":[48],"Shen,":[49],"Emma":[50],"Strubell,":[51],"Nishant":[52],"Subramani,":[53],"Oyvind":[54],"Tafjord,":[55],"Evan":[56],"Walsh,":[57],"Luke":[58],"Zettlemoyer,":[59],"Noah":[60],"Smith,":[61],"Hannaneh":[62],"Hajishirzi,":[63],"Iz":[64],"Beltagy,":[65],"Dirk":[66],"Groeneveld,":[67],"Jesse":[68],"Dodge,":[69],"Lo.":[71],"Proceedings":[72],"of":[73,78],"the":[74,79],"62nd":[75],"Annual":[76],"Meeting":[77],"Association":[80],"for":[81],"Computational":[82],"Linguistics":[83],"(Volume":[84],"1:":[85],"Long":[86],"Papers).":[87],"2024.":[88]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":20},{"year":2024,"cited_by_count":11}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
