{"id":"https://openalex.org/W4417539775","doi":"https://doi.org/10.1145/3788646.3789521","title":"A Common Pool of Privacy Problems: Legal and Technical Lessons from a Large-Scale Web-Scraped Machine Learning Dataset","display_name":"A Common Pool of Privacy Problems: Legal and Technical Lessons from a Large-Scale Web-Scraped Machine Learning Dataset","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W4417539775","doi":"https://doi.org/10.1145/3788646.3789521"},"language":"en","primary_location":{"id":"doi:10.1145/3788646.3789521","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3788646.3789521","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Symposium on Computer Science and Law","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3788646.3789521","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Rachel Hong","orcid":"https://orcid.org/0009-0005-4275-653X"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rachel Hong","raw_affiliation_strings":["University of Washington, Seattle, Washington, USA"],"raw_orcid":"https://orcid.org/0009-0005-4275-653X","affiliations":[{"raw_affiliation_string":"University of Washington, Seattle, Washington, USA","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jevan Hutson","orcid":"https://orcid.org/0000-0003-3312-1733"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jevan Hutson","raw_affiliation_strings":["University of Washington, Seattle, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0003-3312-1733","affiliations":[{"raw_affiliation_string":"University of Washington, Seattle, Washington, USA","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":null,"display_name":"William Agnew","orcid":"https://orcid.org/0000-0002-1362-554X"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"William Agnew","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"],"raw_orcid":"https://orcid.org/0000-0002-1362-554X","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Imaad Huda","orcid":"https://orcid.org/0009-0009-7825-2850"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Imaad Huda","raw_affiliation_strings":["University of Washington, Seattle, Washington, USA"],"raw_orcid":"https://orcid.org/0009-0009-7825-2850","affiliations":[{"raw_affiliation_string":"University of Washington, Seattle, Washington, USA","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tadayoshi Kohno","orcid":"https://orcid.org/0000-0002-4899-226X"},"institutions":[{"id":"https://openalex.org/I184565670","display_name":"Georgetown University","ror":"https://ror.org/05vzafd60","country_code":"US","type":"education","lineage":["https://openalex.org/I184565670"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tadayoshi Kohno","raw_affiliation_strings":["Georgetown University, Washington, D.C., USA"],"raw_orcid":"https://orcid.org/0000-0002-4899-226X","affiliations":[{"raw_affiliation_string":"Georgetown University, Washington, D.C., USA","institution_ids":["https://openalex.org/I184565670"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jamie Morgenstern","orcid":"https://orcid.org/0000-0003-3753-8405"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jamie Morgenstern","raw_affiliation_strings":["University of Washington, Seattle, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0003-3753-8405","affiliations":[{"raw_affiliation_string":"University of Washington, Seattle, Washington, USA","institution_ids":["https://openalex.org/I201448701"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00218552,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"14"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.38280001282691956,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.38280001282691956,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10764","display_name":"Privacy-Preserving Technologies in Data","score":0.20499999821186066,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11045","display_name":"Privacy, Security, and Data Protection","score":0.14249999821186066,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/audit","display_name":"Audit","score":0.6678000092506409},{"id":"https://openalex.org/keywords/information-privacy","display_name":"Information privacy","score":0.597599983215332},{"id":"https://openalex.org/keywords/personally-identifiable-information","display_name":"Personally identifiable information","score":0.5432999730110168},{"id":"https://openalex.org/keywords/privacy-policy","display_name":"Privacy policy","score":0.5307000279426575},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4169999957084656},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.38609999418258667},{"id":"https://openalex.org/keywords/data-protection-act-1998","display_name":"Data Protection Act 1998","score":0.38600000739097595},{"id":"https://openalex.org/keywords/confidentiality","display_name":"Confidentiality","score":0.3677000105381012}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6891999840736389},{"id":"https://openalex.org/C199521495","wikidata":"https://www.wikidata.org/wiki/Q181487","display_name":"Audit","level":2,"score":0.6678000092506409},{"id":"https://openalex.org/C123201435","wikidata":"https://www.wikidata.org/wiki/Q456632","display_name":"Information privacy","level":2,"score":0.597599983215332},{"id":"https://openalex.org/C169093310","wikidata":"https://www.wikidata.org/wiki/Q3702971","display_name":"Personally identifiable information","level":2,"score":0.5432999730110168},{"id":"https://openalex.org/C102938260","wikidata":"https://www.wikidata.org/wiki/Q1999831","display_name":"Privacy policy","level":3,"score":0.5307000279426575},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49720001220703125},{"id":"https://openalex.org/C108827166","wikidata":"https://www.wikidata.org/wiki/Q175975","display_name":"Internet privacy","level":1,"score":0.48069998621940613},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4607999920845032},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4291999936103821},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4169999957084656},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.38609999418258667},{"id":"https://openalex.org/C69360830","wikidata":"https://www.wikidata.org/wiki/Q1172237","display_name":"Data Protection Act 1998","level":2,"score":0.38600000739097595},{"id":"https://openalex.org/C71745522","wikidata":"https://www.wikidata.org/wiki/Q2476929","display_name":"Confidentiality","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C91632574","wikidata":"https://www.wikidata.org/wiki/Q15088675","display_name":"Data curation","level":2,"score":0.3287999927997589},{"id":"https://openalex.org/C137822555","wikidata":"https://www.wikidata.org/wiki/Q2587068","display_name":"Information sensitivity","level":2,"score":0.3271999955177307},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.30149999260902405},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C193934123","wikidata":"https://www.wikidata.org/wiki/Q7246028","display_name":"Privacy by Design","level":3,"score":0.2858000099658966},{"id":"https://openalex.org/C184356942","wikidata":"https://www.wikidata.org/wiki/Q830382","display_name":"Best practice","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C3017597292","wikidata":"https://www.wikidata.org/wiki/Q25052250","display_name":"Privacy protection","level":2,"score":0.2833000123500824},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.275299996137619},{"id":"https://openalex.org/C2780535194","wikidata":"https://www.wikidata.org/wiki/Q309901","display_name":"Open data","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C141972696","wikidata":"https://www.wikidata.org/wiki/Q1247836","display_name":"Privacy law","level":4,"score":0.26269999146461487},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.26159998774528503},{"id":"https://openalex.org/C56739046","wikidata":"https://www.wikidata.org/wiki/Q192060","display_name":"Knowledge management","level":1,"score":0.25119999051094055}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3788646.3789521","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3788646.3789521","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Symposium on Computer Science and Law","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2506.17185","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.17185","pdf_url":"https://arxiv.org/pdf/2506.17185","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.17185","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.17185","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.1145/3788646.3789521","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3788646.3789521","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Symposium on Computer Science and Law","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4203830979","display_name":"CAREER: Strategic and Equity Considerations in Machine Learning","funder_award_id":"2045402","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4640451012","display_name":"Collaborative Proposal: SaTC: Frontiers: Securing the Future of Computing for Marginalized and Vulnerable Populations","funder_award_id":"2205171","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8488338264","display_name":null,"funder_award_id":"CNS-2205171","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320310094","display_name":"University of Washington","ror":"https://ror.org/00cvxb145"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,88,110],"investigate":[1],"the":[2,39,75,150,159],"contents":[3],"of":[4,27,43,52,61,92,115,140,143,152,158],"web-scraped":[5,44,80],"data":[6,107,117],"for":[7,138],"training":[8,55],"AI":[9,153],"systems,":[10],"at":[11],"sizes":[12],"where":[13],"human":[14],"dataset":[15,81,95],"curators":[16],"and":[17,106,133],"compilers":[18],"no":[19],"longer":[20],"manually":[21],"annotate":[22],"every":[23],"sample.":[24],"Building":[25],"off":[26],"prior":[28],"privacy":[29,41,105],"concerns":[30],"in":[31],"machine":[32,45],"learning":[33,46],"models,":[34],"we":[35,57,136],"ask:":[36],"What":[37],"are":[38],"legal":[40,99,113,134],"implications":[42],"datasets?":[47],"In":[48],"an":[49],"empirical":[50,132],"study":[51],"a":[53,93],"popular":[54],"dataset,":[56],"find":[58],"significant":[59],"presence":[60],"personally":[62],"identifiable":[63],"information":[64,124,146],"despite":[65],"sanitization":[66],"efforts.":[67],"Our":[68],"audit":[69],"provides":[70],"concrete":[71],"evidence":[72],"to":[73,96,103,125,147],"support":[74],"concern":[76],"that":[77,120],"any":[78],"large-scale":[79],"may":[82,121],"contain":[83],"legally":[84],"defined":[85],"personal":[86,123],"data.":[87],"use":[89],"these":[90],"findings":[91],"real-world":[94],"inform":[97],"our":[98,131],"analysis":[100],"with":[101],"respect":[102],"existing":[104],"protection":[108],"laws.":[109],"surface":[111],"various":[112],"risks":[114],"current":[116,141],"curation":[118],"practices":[119],"propagate":[122],"train":[126],"downstream":[127],"models.":[128],"Based":[129],"on":[130],"analyses,":[135],"argue":[137],"reorientation":[139],"frameworks":[142],"\u201cpublicly":[144],"available\u201d":[145],"meaningfully":[148],"limit":[149],"development":[151],"built":[154],"upon":[155],"indiscriminate":[156],"scraping":[157],"internet.":[160]},"counts_by_year":[],"updated_date":"2026-06-22T08:00:12.763002","created_date":"2025-10-10T00:00:00"}
