{"id":"https://openalex.org/W4404404699","doi":"https://doi.org/10.48550/arxiv.2411.04257","title":"LSHBloom: Memory-efficient, Extreme-scale Document Deduplication","display_name":"LSHBloom: Memory-efficient, Extreme-scale Document Deduplication","publication_year":2024,"publication_date":"2024-11-06","ids":{"openalex":"https://openalex.org/W4404404699","doi":"https://doi.org/10.48550/arxiv.2411.04257"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2411.04257","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.04257","pdf_url":"https://arxiv.org/pdf/2411.04257","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2411.04257","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057750373","display_name":"Arham Khan","orcid":"https://orcid.org/0009-0006-6960-6651"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Khan, Arham","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014903156","display_name":"Robert Underwood","orcid":"https://orcid.org/0000-0002-4456-1251"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Underwood, Robert","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100422121","display_name":"Carlo Siebenschuh","orcid":"https://orcid.org/0000-0003-0215-3716"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siebenschuh, Carlo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001788942","display_name":"Yadu Babuji","orcid":"https://orcid.org/0000-0002-9162-6003"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Babuji, Yadu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008568853","display_name":"Aswathy Ajith","orcid":"https://orcid.org/0000-0001-7812-2962"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ajith, Aswathy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077184414","display_name":"Kyle Hippe","orcid":"https://orcid.org/0000-0001-9470-572X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hippe, Kyle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015717928","display_name":"Ozan G\u00f6kdemir","orcid":"https://orcid.org/0000-0001-5299-1983"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gokdemir, Ozan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059741659","display_name":"Alexander Brace","orcid":"https://orcid.org/0000-0001-9873-9177"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brace, Alexander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065464552","display_name":"Kyle Chard","orcid":"https://orcid.org/0000-0002-7370-4805"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chard, Kyle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5102971677","display_name":"Ian Foster","orcid":"https://orcid.org/0009-0001-7540-1218"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Foster, Ian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5057750373"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11614","display_name":"Cloud Data Security Solutions","score":0.9812999963760376,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11614","display_name":"Cloud Data Security Solutions","score":0.9812999963760376,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.97079998254776,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.9062215685844421},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6181735396385193},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5193858742713928},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.34426647424697876},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3331902027130127},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.28723371028900146},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.11341556906700134},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.07021668553352356}],"concepts":[{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.9062215685844421},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6181735396385193},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5193858742713928},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.34426647424697876},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3331902027130127},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.28723371028900146},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.11341556906700134},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.07021668553352356}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:oai:arXiv.org:2411.04257","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.04257","pdf_url":"https://arxiv.org/pdf/2411.04257","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:doi:10.48550/arxiv.2411.04257","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2411.04257","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2411.04257","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2411.04257","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2411.04257","pdf_url":"https://arxiv.org/pdf/2411.04257","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4404404699.pdf","grobid_xml":"https://content.openalex.org/works/W4404404699.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W3144870715","https://openalex.org/W3142319788","https://openalex.org/W2587188779","https://openalex.org/W3132870970","https://openalex.org/W4385804830","https://openalex.org/W2943088381","https://openalex.org/W2074021203"],"abstract_inverted_index":{"Contemporary":[0],"large":[1],"language":[2],"model":[3],"(LLM)":[4],"training":[5,51,59,62],"pipelines":[6],"require":[7],"the":[8,39,58,112,121,179,191],"assembly":[9],"of":[10,14,20,38,98,182,194],"internet-scale":[11,220],"databases":[12],"full":[13],"text":[15,221],"data":[16],"from":[17],"a":[18,44,130,209],"variety":[19],"sources":[21],"(e.g.,":[22],"web,":[23],"academic,":[24],"and":[25,34,49,64,101,174],"publishers).":[26],"Preprocessing":[27],"these":[28],"datasets":[29,52],"via":[30],"deduplication":[31,84,124,192,218],"--":[32,42],"detecting":[33],"eliminating":[35],"additional":[36],"instances":[37],"same":[40,122],"content":[41],"is":[43],"major":[45],"focus":[46],"for":[47,53,203],"assembling":[48],"curating":[50],"LLMs.":[54],"Unrestrained,":[55],"duplicates":[56],"in":[57,72,96,133,138,172],"dataset":[60],"increase":[61,132],"costs":[63],"lead":[65],"to":[66,82,108,189,213,219],"undesirable":[67],"properties":[68],"such":[69],"as":[70,126],"memorization":[71],"trained":[73],"models":[74],"or":[75,93],"cheating":[76],"on":[77,149,162,165],"evaluation.":[78],"Unfortunately,":[79],"contemporary":[80],"approaches":[81],"document-level":[83],"are":[85,199],"either":[86],"unreliable":[87],"at":[88,178,196],"accurately":[89],"identifying":[90],"duplicate":[91],"documents":[92],"extremely":[94],"expensive":[95,113],"terms":[97],"both":[99],"runtime":[100,144,175],"memory.":[102],"We":[103],"propose":[104],"LSHBloom,":[105],"an":[106],"extension":[107],"MinhashLSH,":[109,127],"which":[110],"replaces":[111],"LSHIndex":[114],"with":[115,128],"lightweight":[116],"Bloom":[117],"filters.":[118],"LSHBloom":[119,186,211],"demonstrates":[120],"state-of-the-art":[123],"performance":[125],"only":[129,201],"marginal":[131],"false":[134],"positives":[135],"(near":[136],"zero":[137],"our":[139],"experiments),":[140],"while":[141],"boasting":[142],"competitive":[143],"(12$\\times$":[145],"faster":[146],"than":[147,158],"MinhashLSH":[148,159],"peS2o)":[150],"and,":[151],"crucially,":[152],"using":[153],"18$\\times$":[154],"less":[155,204],"disk":[156],"space":[157,173],"(as":[160],"measured":[161],"peS2o).":[163],"Based":[164],"extrapolation,":[166],"we":[167],"show":[168],"that":[169,198],"this":[170],"advantage":[171],"remains":[176],"even":[177],"extreme":[180],"scale":[181],"several":[183],"billion":[184],"documents.":[185],"allows":[187],"practitioners":[188],"access":[190],"quality":[193],"MinHashLSH":[195],"scales":[197],"normally":[200],"tractable":[202],"sophisticated,":[205],"heuristic":[206],"solutions.":[207],"As":[208],"result,":[210],"promises":[212],"enable":[214],"scaling":[215],"high-quality":[216],"document":[217],"datasets.":[222]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
