{"id":"https://openalex.org/W7134818046","doi":"https://doi.org/10.48550/arxiv.2603.06603","title":"Scale Dependent Data Duplication","display_name":"Scale Dependent Data Duplication","publication_year":2026,"publication_date":"2026-02-18","ids":{"openalex":"https://openalex.org/W7134818046","doi":"https://doi.org/10.48550/arxiv.2603.06603"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.06603","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128639287","display_name":"Joshua Kazdan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kazdan, Joshua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128646085","display_name":"Noam Levi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Levi, Noam","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058188131","display_name":"Rylan Schaeffer","orcid":"https://orcid.org/0000-0002-4298-7216"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schaeffer, Rylan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128655197","display_name":"Jessica Chudnovsky","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chudnovsky, Jessica","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128681167","display_name":"Abhay Puri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Puri, Abhay","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128672260","display_name":"Bo He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Bo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128643589","display_name":"Mehmet Donmez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Donmez, Mehmet","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128667885","display_name":"Sanmi Koyejo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koyejo, Sanmi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128662418","display_name":"David Donoho","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Donoho, David","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.3206000030040741,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.3206000030040741,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.045499999076128006,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13720","display_name":"Benford\u2019s Law and Fraud Detection","score":0.042899999767541885,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.7739999890327454},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.5846999883651733},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5803999900817871},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.5263000130653381},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5116000175476074},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4952000081539154},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.478300005197525},{"id":"https://openalex.org/keywords/uniqueness","display_name":"Uniqueness","score":0.47200000286102295},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4323999881744385}],"concepts":[{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.7739999890327454},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6320000290870667},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.5846999883651733},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5803999900817871},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.5263000130653381},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5116000175476074},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4952000081539154},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.478300005197525},{"id":"https://openalex.org/C2777021972","wikidata":"https://www.wikidata.org/wiki/Q22976830","display_name":"Uniqueness","level":2,"score":0.47200000286102295},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4334000051021576},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4323999881744385},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3571000099182129},{"id":"https://openalex.org/C143271835","wikidata":"https://www.wikidata.org/wiki/Q254515","display_name":"Similitude","level":2,"score":0.3447999954223633},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3325999975204468},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30059999227523804},{"id":"https://openalex.org/C193519340","wikidata":"https://www.wikidata.org/wiki/Q891179","display_name":"Data loss","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C203519979","wikidata":"https://www.wikidata.org/wiki/Q865360","display_name":"Jaccard index","level":3,"score":0.2896000146865845},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27880001068115234},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.27059999108314514},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26499998569488525},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.26030001044273376},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.25949999690055847},{"id":"https://openalex.org/C2129575","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Semantic Web","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.06603","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.06603","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.06603","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.06603","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.687882125377655}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Data":[0],"duplication":[1,61],"during":[2,55],"pretraining":[3,146,201],"can":[4],"degrade":[5],"generalization":[6],"and":[7,206],"lead":[8],"to":[9,129,188,195],"memorization,":[10],"motivating":[11],"aggressive":[12],"deduplication":[13],"pipelines.":[14],"However,":[15,124],"at":[16,218],"web":[17],"scale,":[18],"it":[19],"is":[20,62],"unclear":[21],"what":[22],"constitutes":[23],"a":[24],"``duplicate'':":[25],"beyond":[26],"surface-form":[27],"matches,":[28],"semantically":[29,76],"equivalent":[30,77],"documents":[31,78,106,157],"(e.g.":[32],"translations)":[33],"may":[34],"induce":[35],"redundant":[36],"training":[37],"signals":[38],"once":[39],"models":[40],"become":[41,79],"sufficiently":[42],"capable.":[43],"Practically,":[44],"this":[45],"means":[46],"that":[47,60,88,159,185],"semantic":[48,97,142,197],"duplicates":[49,54],"operate":[50],"increasingly":[51],"like":[52],"exact":[53],"training.":[56],"We":[57,180],"present":[58],"evidence":[59],"scale-dependent":[63],"in":[64],"two":[65],"ways.":[66],"First,":[67],"as":[68,125],"model":[69],"capability":[70],"increases,":[71],"cross-entropy":[72],"loss":[73,171],"gradients":[74,87],"for":[75,165,173,214],"more":[80,215],"aligned.":[81],"Smaller":[82],"models,":[83,167,175],"by":[84],"contrast,":[85],"produce":[86],"reflect":[89],"surface":[90],"similarity":[91,115],"(e.g.,":[92],"shared":[93],"tokens)":[94],"rather":[95],"than":[96],"similarity.":[98],"Second,":[99],"we":[100],"embedded":[101],"all":[102],"192":[103],"million":[104],"FineWeb-Edu-Dedup":[105],"using":[107],"EmbeddingGemma-300m.":[108],"For":[109],"moderate":[110],"corpus":[111,126],"sizes,":[112],"the":[113,135,200],"cosine":[114],"between":[116],"nearest-neighbors":[117],"follows":[118],"an":[119,208],"isotropic":[120],"power":[121],"law":[122],"baseline.":[123],"size":[127],"grows":[128],"hundreds":[130],"of":[131,133,154,199,211],"billions":[132],"tokens,":[134],"nearest-neighbor":[136],"similarities":[137],"deviate":[138],"sharply,":[139],"indicating":[140],"accelerated":[141],"collisions.":[143],"Finally,":[144],"controlled":[145],"on":[147],"data":[148],"sampled":[149],"with":[150],"replacement":[151],"from":[152,191],"pools":[153],"finite":[155],"unique":[156],"shows":[158],"limited":[160,196],"uniqueness":[161,198],"yields":[162],"mild":[163],"degradation":[164],"small":[166],"but":[168],"rapidly":[169],"increasing":[170],"penalties":[172],"larger":[174],"breaking":[176],"naive":[177],"scaling":[178,183,193],"extrapolation.":[179],"derive":[181],"explicit":[182],"laws":[184],"allow":[186],"practitioners":[187],"estimate":[189],"deviation":[190],"expected":[192],"due":[194],"corpus.":[202],"Our":[203],"results":[204],"identify":[205],"resolve":[207],"unstudied":[209],"source":[210],"scale-dependence,":[212],"allowing":[213],"accurate":[216],"prediction":[217],"scale.":[219]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-03-11T00:00:00"}
