{"id":"https://openalex.org/W4412886753","doi":"https://doi.org/10.18653/v1/2025.acl-long.123","title":"Nemotron-CC: Transforming Common Crawl into a Refined Long-Horizon Pretraining Dataset","display_name":"Nemotron-CC: Transforming Common Crawl into a Refined Long-Horizon Pretraining Dataset","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412886753","doi":"https://doi.org/10.18653/v1/2025.acl-long.123"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-long.123","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.123","pdf_url":"https://aclanthology.org/2025.acl-long.123.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.acl-long.123.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5075183307","display_name":"Dan Su","orcid":"https://orcid.org/0000-0001-5746-9545"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dan Su","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006471838","display_name":"Kezhi Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kezhi Kong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088651115","display_name":"Ying Lin","orcid":"https://orcid.org/0000-0003-0152-2436"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ying Lin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045962920","display_name":"Joseph Jennings","orcid":"https://orcid.org/0000-0001-7093-355X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joseph Jennings","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048591183","display_name":"Brandon Norick","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brandon Norick","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076322978","display_name":"Markus Kliegl","orcid":"https://orcid.org/0000-0001-6063-3959"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Markus Kliegl","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031170568","display_name":"Mostofa Patwary","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mostofa Patwary","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072436307","display_name":"Mohammad Shoeybi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mohammad Shoeybi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066242985","display_name":"Bryan Catanzaro","orcid":"https://orcid.org/0000-0003-0034-7728"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bryan Catanzaro","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9349,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77167313,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"2459","last_page":"2475"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.7031000256538391,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.7031000256538391,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.6929000020027161,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.5946000218391418,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/horizon","display_name":"Horizon","score":0.6927010416984558},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5404409766197205},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4482628107070923},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.2158239185810089},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.08299830555915833}],"concepts":[{"id":"https://openalex.org/C159176650","wikidata":"https://www.wikidata.org/wiki/Q43261","display_name":"Horizon","level":2,"score":0.6927010416984558},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5404409766197205},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4482628107070923},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2158239185810089},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.08299830555915833}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.acl-long.123","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.123","pdf_url":"https://aclanthology.org/2025.acl-long.123.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.acl-long.123","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.123","pdf_url":"https://aclanthology.org/2025.acl-long.123.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/13","score":0.4399999976158142,"display_name":"Climate action"}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412886753.pdf","grobid_xml":"https://content.openalex.org/works/W4412886753.grobid-xml"},"referenced_works_count":1,"referenced_works":["https://openalex.org/W3093517588"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Dan":[0],"Su,":[1],"Kezhi":[2],"Kong,":[3],"Ying":[4],"Lin,":[5],"Joseph":[6],"Jennings,":[7],"Brandon":[8],"Norick,":[9],"Markus":[10],"Kliegl,":[11],"Mostofa":[12],"Patwary,":[13],"Mohammad":[14],"Shoeybi,":[15],"Bryan":[16],"Catanzaro.":[17],"Proceedings":[18],"of":[19,24],"the":[20,25],"63rd":[21],"Annual":[22],"Meeting":[23],"Association":[26],"for":[27],"Computational":[28],"Linguistics":[29],"(Volume":[30],"1:":[31],"Long":[32],"Papers).":[33],"2025.":[34]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
