{"id":"https://openalex.org/W4402057292","doi":"https://doi.org/10.1038/s42256-024-00878-8","title":"A large-scale audit of dataset licensing and attribution in AI","display_name":"A large-scale audit of dataset licensing and attribution in AI","publication_year":2024,"publication_date":"2024-08-30","ids":{"openalex":"https://openalex.org/W4402057292","doi":"https://doi.org/10.1038/s42256-024-00878-8"},"language":"en","primary_location":{"id":"doi:10.1038/s42256-024-00878-8","is_oa":true,"landing_page_url":"https://doi.org/10.1038/s42256-024-00878-8","pdf_url":"https://www.nature.com/articles/s42256-024-00878-8.pdf","source":{"id":"https://openalex.org/S2912241403","display_name":"Nature Machine Intelligence","issn_l":"2522-5839","issn":["2522-5839"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319908","host_organization_name":"Nature Portfolio","host_organization_lineage":["https://openalex.org/P4310319908","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Nature Portfolio","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Nature Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://www.nature.com/articles/s42256-024-00878-8.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001884064","display_name":"Shayne Longpre","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shayne Longpre","raw_affiliation_strings":["Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028144841","display_name":"Robert Mahari","orcid":"https://orcid.org/0000-0003-2372-2746"},"institutions":[{"id":"https://openalex.org/I2801851002","display_name":"Harvard University Press","ror":"https://ror.org/006v7bf86","country_code":"US","type":"other","lineage":["https://openalex.org/I136199984","https://openalex.org/I2801851002"]},{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Robert Mahari","raw_affiliation_strings":["Harvard Law School, Harvard University, Cambridge, MA, USA","Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Harvard Law School, Harvard University, Cambridge, MA, USA","institution_ids":["https://openalex.org/I2801851002"]},{"raw_affiliation_string":"Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087171054","display_name":"Anthony Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I204250578","display_name":"University of California, Irvine","ror":"https://ror.org/04gyf1771","country_code":"US","type":"education","lineage":["https://openalex.org/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anthony Chen","raw_affiliation_strings":["Department of Computer Science, University of California, Irvine, CA, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of California, Irvine, CA, USA","institution_ids":["https://openalex.org/I204250578"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092064801","display_name":"Naana Obeng-Marnu","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Naana Obeng-Marnu","raw_affiliation_strings":["Center for Constructive Communication, Massachusetts Institute of Technology, Cambridge, MA, USA","Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Constructive Communication, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068059558","display_name":"Damien Sileo","orcid":"https://orcid.org/0000-0002-3274-291X"},"institutions":[{"id":"https://openalex.org/I2279609970","display_name":"Universit\u00e9 de Lille","ror":"https://ror.org/02kzqn938","country_code":"FR","type":"education","lineage":["https://openalex.org/I2279609970"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Damien Sileo","raw_affiliation_strings":["Inria Centre, University of Lille, Lille, France"],"affiliations":[{"raw_affiliation_string":"Inria Centre, University of Lille, Lille, France","institution_ids":["https://openalex.org/I2279609970"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079113224","display_name":"William Brannon","orcid":"https://orcid.org/0000-0002-1435-8535"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"William Brannon","raw_affiliation_strings":["Center for Constructive Communication, Massachusetts Institute of Technology, Cambridge, MA, USA","Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Constructive Communication, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000043237","display_name":"Niklas Muennighoff","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128771","display_name":"Contextual Change (United States)","ror":"https://ror.org/03bskcm82","country_code":"US","type":"company","lineage":["https://openalex.org/I4210128771"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Niklas Muennighoff","raw_affiliation_strings":["Contextual AI, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"Contextual AI, Mountain View, CA, USA","institution_ids":["https://openalex.org/I4210128771"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093134538","display_name":"Nathan Khazam","orcid":null},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nathan Khazam","raw_affiliation_strings":["College of Engineering & Applied Science, University of Colorado Boulder, Boulder, CO, USA"],"affiliations":[{"raw_affiliation_string":"College of Engineering & Applied Science, University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026429870","display_name":"Jad Kabbara","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jad Kabbara","raw_affiliation_strings":["Center for Constructive Communication, Massachusetts Institute of Technology, Cambridge, MA, USA","Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Center for Constructive Communication, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]},{"raw_affiliation_string":"Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003528238","display_name":"Kartik Perisetla","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kartik Perisetla","raw_affiliation_strings":["Data Provenance Initiative, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Data Provenance Initiative, Cambridge, MA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101882209","display_name":"Xinyi Wu","orcid":"https://orcid.org/0000-0001-9475-3510"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xinyi Wu","raw_affiliation_strings":["Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"affiliations":[{"raw_affiliation_string":"Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092741733","display_name":"Enrico Shippole","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Enrico Shippole","raw_affiliation_strings":["Teraflop AI, Boca Raton, FL, USA"],"affiliations":[{"raw_affiliation_string":"Teraflop AI, Boca Raton, FL, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013580260","display_name":"Kurt Bollacker","orcid":null},"institutions":[{"id":"https://openalex.org/I4210120668","display_name":"Creative Commons","ror":"https://ror.org/02ed4cj64","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210120668"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kurt Bollacker","raw_affiliation_strings":["ML Commons, San Francisco, CA, USA"],"affiliations":[{"raw_affiliation_string":"ML Commons, San Francisco, CA, USA","institution_ids":["https://openalex.org/I4210120668"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004225142","display_name":"Tongshuang Wu","orcid":"https://orcid.org/0000-0003-1630-0588"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tongshuang Wu","raw_affiliation_strings":["Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, PA, USA"],"affiliations":[{"raw_affiliation_string":"Human-Computer Interaction Institute, Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108457098","display_name":"L. F. Villa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luis Villa","raw_affiliation_strings":["Tidelift, Boston, MA, USA"],"affiliations":[{"raw_affiliation_string":"Tidelift, Boston, MA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107954910","display_name":"Sandy Pentland","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sandy Pentland","raw_affiliation_strings":["Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"Media Lab, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078850040","display_name":"Sara Hooker","orcid":"https://orcid.org/0000-0002-0190-6459"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sara Hooker","raw_affiliation_strings":["Cohere For AI, Toronto, Ontario, Canada"],"affiliations":[{"raw_affiliation_string":"Cohere For AI, Toronto, Ontario, Canada","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":17,"corresponding_author_ids":["https://openalex.org/A5001884064"],"corresponding_institution_ids":["https://openalex.org/I63966007"],"apc_list":{"value":9750,"currency":"EUR","value_usd":11690},"apc_paid":{"value":9750,"currency":"EUR","value_usd":11690},"fwci":25.8939,"has_fulltext":true,"cited_by_count":51,"citation_normalized_percentile":{"value":0.99606806,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":"6","issue":"8","first_page":"975","last_page":"987"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.984499990940094,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transparency","display_name":"Transparency (behavior)","score":0.7187000513076782},{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.6135375499725342},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6062200665473938},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.540675163269043},{"id":"https://openalex.org/keywords/trace","display_name":"TRACE (psycholinguistics)","score":0.45816734433174133},{"id":"https://openalex.org/keywords/bloom-filter","display_name":"Bloom filter","score":0.44682541489601135},{"id":"https://openalex.org/keywords/internet-privacy","display_name":"Internet privacy","score":0.43045783042907715},{"id":"https://openalex.org/keywords/data-breach","display_name":"Data breach","score":0.429929256439209},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.33656466007232666},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.30880647897720337},{"id":"https://openalex.org/keywords/business","display_name":"Business","score":0.21489712595939636},{"id":"https://openalex.org/keywords/accounting","display_name":"Accounting","score":0.16241714358329773}],"concepts":[{"id":"https://openalex.org/C2780233690","wikidata":"https://www.wikidata.org/wiki/Q535347","display_name":"Transparency (behavior)","level":2,"score":0.7187000513076782},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.6135375499725342},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6062200665473938},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.540675163269043},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.45816734433174133},{"id":"https://openalex.org/C147224247","wikidata":"https://www.wikidata.org/wiki/Q885373","display_name":"Bloom filter","level":2,"score":0.44682541489601135},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.43045783042907715},{"id":"https://openalex.org/C165609540","wikidata":"https://www.wikidata.org/wiki/Q1172486","display_name":"Data breach","level":2,"score":0.429929256439209},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.33656466007232666},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.30880647897720337},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.21489712595939636},{"id":"https://openalex.org/C121955636","wikidata":"https://www.wikidata.org/wiki/Q4116214","display_name":"Accounting","level":1,"score":0.16241714358329773},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1038/s42256-024-00878-8","is_oa":true,"landing_page_url":"https://doi.org/10.1038/s42256-024-00878-8","pdf_url":"https://www.nature.com/articles/s42256-024-00878-8.pdf","source":{"id":"https://openalex.org/S2912241403","display_name":"Nature Machine Intelligence","issn_l":"2522-5839","issn":["2522-5839"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319908","host_organization_name":"Nature Portfolio","host_organization_lineage":["https://openalex.org/P4310319908","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Nature Portfolio","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Nature Machine Intelligence","raw_type":"journal-article"},{"id":"pmh:oai:HAL:hal-04749695v1","is_oa":true,"landing_page_url":"https://hal.science/hal-04749695","pdf_url":"https://hal.science/hal-04749695/document","source":{"id":"https://openalex.org/S4406922466","display_name":"SPIRE - Sciences Po Institutional REpository","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Nature Machine Intelligence, 2024, 6 (8), pp.975-987. &#x27E8;10.1038/s42256-024-00878-8&#x27E9;","raw_type":"Journal articles"}],"best_oa_location":{"id":"doi:10.1038/s42256-024-00878-8","is_oa":true,"landing_page_url":"https://doi.org/10.1038/s42256-024-00878-8","pdf_url":"https://www.nature.com/articles/s42256-024-00878-8.pdf","source":{"id":"https://openalex.org/S2912241403","display_name":"Nature Machine Intelligence","issn_l":"2522-5839","issn":["2522-5839"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319908","host_organization_name":"Nature Portfolio","host_organization_lineage":["https://openalex.org/P4310319908","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Nature Portfolio","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Nature Machine Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4402057292.pdf"},"referenced_works_count":59,"referenced_works":["https://openalex.org/W1525961042","https://openalex.org/W2279316390","https://openalex.org/W2770618123","https://openalex.org/W2897042519","https://openalex.org/W2911227954","https://openalex.org/W3083410900","https://openalex.org/W3100279624","https://openalex.org/W3137010024","https://openalex.org/W3154151289","https://openalex.org/W3163073193","https://openalex.org/W3172314079","https://openalex.org/W3197876970","https://openalex.org/W3203385474","https://openalex.org/W3205068155","https://openalex.org/W3212368439","https://openalex.org/W3213241618","https://openalex.org/W4206637810","https://openalex.org/W4221159672","https://openalex.org/W4223908421","https://openalex.org/W4224275713","https://openalex.org/W4226227340","https://openalex.org/W4226278401","https://openalex.org/W4281690148","https://openalex.org/W4283155630","https://openalex.org/W4285199616","https://openalex.org/W4296413526","https://openalex.org/W4317553041","https://openalex.org/W4318908031","https://openalex.org/W4322718191","https://openalex.org/W4360836968","https://openalex.org/W4365794116","https://openalex.org/W4366341216","https://openalex.org/W4367000491","https://openalex.org/W4379468930","https://openalex.org/W4379539933","https://openalex.org/W4380769213","https://openalex.org/W4380993527","https://openalex.org/W4381889417","https://openalex.org/W4384155024","https://openalex.org/W4384918448","https://openalex.org/W4385474529","https://openalex.org/W4385570984","https://openalex.org/W4385571124","https://openalex.org/W4385572634","https://openalex.org/W4385849242","https://openalex.org/W4389519291","https://openalex.org/W4389519446","https://openalex.org/W4389520710","https://openalex.org/W4389524372","https://openalex.org/W4401042685","https://openalex.org/W6782465632","https://openalex.org/W6788175385","https://openalex.org/W6800875267","https://openalex.org/W6847076894","https://openalex.org/W6849116879","https://openalex.org/W6852449896","https://openalex.org/W6852800892","https://openalex.org/W6858193405","https://openalex.org/W6949315280"],"related_works":["https://openalex.org/W2086572746","https://openalex.org/W2604468458","https://openalex.org/W2157216338","https://openalex.org/W121740227","https://openalex.org/W1662107788","https://openalex.org/W3082379938","https://openalex.org/W2135966669","https://openalex.org/W2594143027","https://openalex.org/W2732769800","https://openalex.org/W2924589531"],"abstract_inverted_index":{"Abstract":[0],"The":[1],"race":[2],"to":[3,37,52,97,158,164,186,189],"train":[4],"language":[5],"models":[6],"on":[7,107,193],"vast,":[8],"diverse":[9],"and":[10,17,24,33,40,50,64,76,91,120,133,155,170,191],"inconsistently":[11],"documented":[12],"datasets":[13,138],"raises":[14],"pressing":[15],"legal":[16,32],"ethical":[18],"concerns.":[19],"To":[20],"improve":[21],"data":[22,79,94,146,194,201],"transparency":[23,169],"understanding,":[25],"we":[26,173],"convene":[27],"a":[28,129,162],"multi-disciplinary":[29],"effort":[30],"between":[31],"machine":[34],"learning":[35],"experts":[36],"systematically":[38],"audit":[39],"trace":[41,53,190],"more":[42,117,124],"than":[43,118,125],"1,800":[44],"text":[45],"datasets.":[46],"We":[47,101],"develop":[48],"tools":[49],"standards":[51],"the":[54,74,150,182,197],"lineage":[55],"of":[56,78,105,116,123,136,145,152],"these":[57],"datasets,":[58],"including":[59,86],"their":[60],"source,":[61],"creators,":[62],"licences":[63,106],"subsequent":[65],"use.":[66,83],"Our":[67,143],"landscape":[68],"analysis":[69,144],"highlights":[70,128],"sharp":[71],"divides":[72],"in":[73,131,167],"composition":[75],"focus":[77],"licenced":[80],"for":[81,196],"commercial":[82],"Important":[84],"categories":[85],"low-resource":[87],"languages,":[88],"creative":[89],"tasks":[90],"new":[92],"synthetic":[93],"all":[95],"tend":[96],"be":[98],"restrictively":[99],"licenced.":[100],"observe":[102],"frequent":[103],"miscategorization":[104],"popular":[108,137,199],"dataset":[109,168],"hosting":[110],"sites,":[111],"with":[112,177],"licence":[113],"omission":[114],"rates":[115,122],"70%":[119],"error":[121],"50%.":[126],"This":[127],"crisis":[130],"misattribution":[132],"informed":[134],"use":[135,157],"driving":[139],"many":[140],"recent":[141],"breakthroughs.":[142],"sources":[147],"also":[148],"explains":[149],"application":[151],"copyright":[153],"law":[154],"fair":[156],"finetuning":[159,200],"data.":[160],"As":[161],"contribution":[163],"continuing":[165],"improvements":[166],"responsible":[171],"use,":[172],"release":[174],"our":[175],"audit,":[176],"an":[178],"interactive":[179],"user":[180],"interface,":[181],"Data":[183],"Provenance":[184],"Explorer,":[185],"enable":[187],"practitioners":[188],"filter":[192],"provenance":[195],"most":[198],"collections:":[202],"www.dataprovenance.org":[203],".":[204]},"counts_by_year":[{"year":2026,"cited_by_count":10},{"year":2025,"cited_by_count":37},{"year":2024,"cited_by_count":4}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2024-08-31T00:00:00"}
