{"id":"https://openalex.org/W4381328570","doi":"https://doi.org/10.1145/3589324","title":"Near-Duplicate Sequence Search at Scale for Large Language Model Memorization Evaluation","display_name":"Near-Duplicate Sequence Search at Scale for Large Language Model Memorization Evaluation","publication_year":2023,"publication_date":"2023-06-13","ids":{"openalex":"https://openalex.org/W4381328570","doi":"https://doi.org/10.1145/3589324"},"language":"en","primary_location":{"id":"doi:10.1145/3589324","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3589324","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045267283","display_name":"Zhencan Peng","orcid":"https://orcid.org/0000-0003-4182-0075"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zhencan Peng","raw_affiliation_strings":["Rutgers University, New Brunswick, NJ, USA"],"raw_orcid":"https://orcid.org/0000-0003-4182-0075","affiliations":[{"raw_affiliation_string":"Rutgers University, New Brunswick, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101850444","display_name":"Zhizhi Wang","orcid":"https://orcid.org/0000-0003-2223-9621"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhizhi Wang","raw_affiliation_strings":["Rutgers University, New Brunswick, NJ, USA"],"raw_orcid":"https://orcid.org/0000-0003-2223-9621","affiliations":[{"raw_affiliation_string":"Rutgers University, New Brunswick, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103237059","display_name":"Dong Deng","orcid":"https://orcid.org/0000-0002-4596-3850"},"institutions":[{"id":"https://openalex.org/I102322142","display_name":"Rutgers, The State University of New Jersey","ror":"https://ror.org/05vt9qd57","country_code":"US","type":"education","lineage":["https://openalex.org/I102322142"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dong Deng","raw_affiliation_strings":["Rutgers University, New Brunswick, NJ, USA"],"raw_orcid":"https://orcid.org/0000-0002-4596-3850","affiliations":[{"raw_affiliation_string":"Rutgers University, New Brunswick, NJ, USA","institution_ids":["https://openalex.org/I102322142"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5045267283"],"corresponding_institution_ids":["https://openalex.org/I102322142"],"apc_list":null,"apc_paid":null,"fwci":1.9725,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.86755999,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"1","issue":"2","first_page":"1","last_page":"18"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7921513319015503},{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.6918776631355286},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.6554389595985413},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6503174304962158},{"id":"https://openalex.org/keywords/memorization","display_name":"Memorization","score":0.6103652119636536},{"id":"https://openalex.org/keywords/perfect-hash-function","display_name":"Perfect hash function","score":0.4475267827510834},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.44522330164909363},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4121795892715454},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.41035863757133484},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4074459969997406},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3908231854438782},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3570767343044281},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.33885568380355835},{"id":"https://openalex.org/keywords/hash-table","display_name":"Hash table","score":0.31319183111190796},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.12885421514511108},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.12478700280189514}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7921513319015503},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.6918776631355286},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.6554389595985413},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6503174304962158},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.6103652119636536},{"id":"https://openalex.org/C87431388","wikidata":"https://www.wikidata.org/wiki/Q2070573","display_name":"Perfect hash function","level":4,"score":0.4475267827510834},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.44522330164909363},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4121795892715454},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.41035863757133484},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4074459969997406},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3908231854438782},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3570767343044281},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33885568380355835},{"id":"https://openalex.org/C67388219","wikidata":"https://www.wikidata.org/wiki/Q207440","display_name":"Hash table","level":3,"score":0.31319183111190796},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.12885421514511108},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.12478700280189514},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C145420912","wikidata":"https://www.wikidata.org/wiki/Q853077","display_name":"Mathematics education","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3589324","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3589324","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.4699999988079071}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W46679369","https://openalex.org/W166877056","https://openalex.org/W179875071","https://openalex.org/W1509577953","https://openalex.org/W1963838724","https://openalex.org/W1968625547","https://openalex.org/W1973001156","https://openalex.org/W1991175610","https://openalex.org/W2007842132","https://openalex.org/W2024680396","https://openalex.org/W2026968007","https://openalex.org/W2065259291","https://openalex.org/W2067432306","https://openalex.org/W2097776316","https://openalex.org/W2111549955","https://openalex.org/W2119455368","https://openalex.org/W2121269638","https://openalex.org/W2129750215","https://openalex.org/W2132069633","https://openalex.org/W2134212491","https://openalex.org/W2139660688","https://openalex.org/W2148578434","https://openalex.org/W2164634022","https://openalex.org/W2167302605","https://openalex.org/W2294331997","https://openalex.org/W2403774614","https://openalex.org/W2430378630","https://openalex.org/W2462305634","https://openalex.org/W2489320908","https://openalex.org/W2798412430","https://openalex.org/W2912924812","https://openalex.org/W3177445587","https://openalex.org/W3184324824","https://openalex.org/W4281483318","https://openalex.org/W4282565958","https://openalex.org/W4288057780","https://openalex.org/W4292779060","https://openalex.org/W6601894380","https://openalex.org/W6778883912"],"related_works":["https://openalex.org/W2097286495","https://openalex.org/W2080388000","https://openalex.org/W2065331859","https://openalex.org/W2155123971","https://openalex.org/W1897694601","https://openalex.org/W1845395494","https://openalex.org/W2144265691","https://openalex.org/W1835589799","https://openalex.org/W1605991620","https://openalex.org/W4385261619"],"abstract_inverted_index":{"Recent":[0],"studies":[1,46],"show":[2,265],"that":[3,26,120,208,266],"large":[4,165,260],"language":[5],"models":[6],"(LLM)":[7],"unintendedly":[8],"memorize":[9],"part":[10,37],"of":[11,29,38,73,76,85,123,159,178],"the":[12,41,50,79,90,121,131,156,160,170,175,180,196,202,226,240,247],"training":[13,42,80,263],"data,":[14],"which":[15],"brings":[16],"serious":[17],"privacy":[18],"risks.":[19],"For":[20],"example,":[21],"it":[22],"has":[23],"been":[24],"shown":[25],"over":[27],"1%":[28],"tokens":[30,74,186,222],"generated":[31,63,216],"unprompted":[32],"by":[33,70,95],"an":[34,140],"LLM":[35,262],"are":[36,104,191,215,231],"sequences":[39,124,158,181,241],"in":[40,78,125,148,163,195,198,223],"data.":[43],"However,":[44],"current":[45],"mainly":[47],"focus":[48],"on":[49,106,257],"exact":[51],"memorization":[52],"behaviors.":[53],"In":[54],"this":[55,87,136,149],"paper,":[56],"we":[57,138,237],"propose":[58],"to":[59,113,130,201],"evaluate":[60],"how":[61],"many":[62],"texts":[64],"have":[65],"near-duplicates":[66,190],"(e.g.,":[67],"only":[68,209],"differ":[69],"a":[71,126,164,218,234,258],"couple":[72],"out":[75],"100)":[77],"corpus.":[81],"A":[82],"major":[83],"challenge":[84],"conducting":[86],"evaluation":[88],"is":[89,100,119,128,272],"huge":[91],"computation":[92],"cost":[93],"incurred":[94],"near-duplicate":[96,144,157,268],"sequence":[97,145,162,269],"searches.":[98],"This":[99],"because":[101],"modern":[102],"LLMs":[103],"trained":[105],"larger":[107,109],"and":[108,142,173,229,252,274],"corpora":[110,264],"with":[111,167,182,220,246],"up":[112],"1":[114],"trillion":[115],"tokens.":[116],"What's":[117],"worse":[118],"number":[122],"text":[127,132,219],"quadratic":[129],"length.":[133],"To":[134],"address":[135],"issue,":[137],"develop":[139],"efficient":[141,273],"scalable":[143],"search":[146,270],"algorithm":[147,171,271],"paper.":[150],"It":[151],"can":[152],"find":[153,238],"(almost)":[154],"all":[155,179,239],"query":[161,235,248],"corpus":[166,197,203],"guarantees.":[168],"Specifically,":[169],"generates":[172],"groups":[174],"min-hash":[176,213,244],"values":[177,214,245],"at":[183],"least":[184],"t":[185],"(as":[187],"very":[188],"short":[189],"often":[192],"irrelevant":[193],"noise)":[194],"linear":[199],"time":[200,228],"size.":[204],"We":[205],"formally":[206],"prove":[207],"2":[210],"n+1/t+1":[211],"-1":[212],"for":[217],"n":[221],"expectation.":[224],"Thus":[225],"index":[227],"size":[230],"reasonable.":[232],"When":[233],"arrives,":[236],"sharing":[242],"enough":[243],"using":[249],"inverted":[250],"indexes":[251],"prefix":[253],"filtering.":[254],"Extensive":[255],"experiments":[256],"few":[259],"real-world":[261],"our":[267],"scalable.":[275]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
