{"id":"https://openalex.org/W7148454734","doi":"https://doi.org/10.48550/arxiv.2604.00920","title":"GPT-NL Public Corpus: A Permissively Licensed, Dutch-First Dataset for LLM Pre-training","display_name":"GPT-NL Public Corpus: A Permissively Licensed, Dutch-First Dataset for LLM Pre-training","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7148454734","doi":"https://doi.org/10.48550/arxiv.2604.00920"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00920","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00920","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00920","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109738270","display_name":"Jesse van Oort","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"van Oort, Jesse","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132813610","display_name":"Frank Brinkkemper","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brinkkemper, Frank","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132800522","display_name":"Erik de Graaf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"de Graaf, Erik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132790525","display_name":"Bram Vanroy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vanroy, Bram","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132794431","display_name":"Saskia Lensink","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lensink, Saskia","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5109738270"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.20739999413490295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.20739999413490295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.11949999630451202,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14330","display_name":"Library Science and Information Systems","score":0.09210000187158585,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/corpus-linguistics","display_name":"Corpus linguistics","score":0.48820000886917114},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.3693000078201294},{"id":"https://openalex.org/keywords/text-corpus","display_name":"Text corpus","score":0.35040000081062317},{"id":"https://openalex.org/keywords/text-messaging","display_name":"Text messaging","score":0.2687999904155731}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6097000241279602},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5432999730110168},{"id":"https://openalex.org/C532629269","wikidata":"https://www.wikidata.org/wiki/Q865083","display_name":"Corpus linguistics","level":2,"score":0.48820000886917114},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42649999260902405},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.3693000078201294},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3325999975204468},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.32690000534057617},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2766000032424927},{"id":"https://openalex.org/C3018949938","wikidata":"https://www.wikidata.org/wiki/Q17166101","display_name":"Text messaging","level":2,"score":0.2687999904155731},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.2678000032901764},{"id":"https://openalex.org/C2776390805","wikidata":"https://www.wikidata.org/wiki/Q7258035","display_name":"Public use","level":2,"score":0.2540999948978424}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00920","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00920","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00920","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00920","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6525780558586121,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1,29],"the":[2,6,37,105,109,125,152],"GPT-NL":[3,16,126],"Public":[4,17,127],"Corpus,":[5],"biggest":[7],"permissively":[8],"licensed":[9],"corpus":[10,38,60],"of":[11,87,107,111],"Dutch":[12,26,84],"language":[13,113],"resources.":[14],"The":[15,145],"Corpus":[18,70,128],"contains":[19],"21":[20],"Dutch-only":[21],"collections":[22,85],"totalling":[23],"36B":[24],"preprocessed":[25],"tokens":[27,48],"not":[28],"in":[30,90,124],"any":[31],"other":[32],"LLM":[33],"pretraining":[34],"corpus.":[35],"Additionally,":[36],"includes":[39,61],"roughly":[40],"207B":[41],"English,":[42],"232B":[43],"Code,":[44],"and":[45,71,102,119,136,139],"48B":[46],"German/Danish":[47],"taken":[49],"from":[50,64,131],"existing":[51,66],"sets":[52],"which":[53],"we":[54],"further":[55],"curated":[56,62,138],"for":[57],"compliance.":[58],"This":[59],"data":[63,99,122],"large":[65],"corpora":[67],"like":[68],"Common":[69,72],"Crawl,":[73],"as":[74,76],"well":[75],"newly":[77,82],"created":[78,83],"Dutch-specific":[79],"collections.":[80],"Most":[81],"consist":[86],"content":[88],"collected":[89,101],"collaboration":[91],"with":[92,104,133],"organisations":[93],"or":[94],"synthetically":[95],"augmented":[96],"content.":[97],"All":[98,121],"is":[100,129,137,148],"evaluated":[103],"aim":[106],"facilitating":[108],"creation":[110],"(commercial)":[112],"models":[114],"that":[115],"are":[116],"lawful,":[117],"useful":[118],"non-harmful.":[120],"included":[123],"sourced":[130],"datasets":[132],"permissive":[134],"licensing":[135],"redistributed":[140],"under":[141],"a":[142],"CC-BY":[143],"license.":[144],"full":[146],"dataset":[147],"publicly":[149],"available":[150],"on":[151],"Hugging":[153],"Face":[154],"Hub.":[155]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
