{"id":"https://openalex.org/W4389609547","doi":"https://doi.org/10.1145/3626762","title":"R2D2: Reducing Redundancy and Duplication in Data Lakes","display_name":"R2D2: Reducing Redundancy and Duplication in Data Lakes","publication_year":2023,"publication_date":"2023-12-08","ids":{"openalex":"https://openalex.org/W4389609547","doi":"https://doi.org/10.1145/3626762"},"language":"en","primary_location":{"id":"doi:10.1145/3626762","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626762","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.13427","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5046282073","display_name":"Raunak Shah","orcid":"https://orcid.org/0000-0002-2889-7855"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Raunak Shah","raw_affiliation_strings":["University of Illinois, Urbana-Champaign, Champaign, IL, USA"],"affiliations":[{"raw_affiliation_string":"University of Illinois, Urbana-Champaign, Champaign, IL, USA","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048749345","display_name":"Koyel Mukherjee","orcid":"https://orcid.org/0000-0002-8690-323X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koyel Mukherjee","raw_affiliation_strings":["Adobe Research, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Adobe Research, Bangalore, India","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056629319","display_name":"A. Tyagi","orcid":"https://orcid.org/0009-0008-3904-7953"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Atharv Tyagi","raw_affiliation_strings":["Adobe Research, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Adobe Research, Bangalore, India","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020728083","display_name":"Sai Keerthana Karnam","orcid":"https://orcid.org/0000-0003-1328-5167"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sai Keerthana Karnam","raw_affiliation_strings":["Adobe, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Adobe, Bangalore, India","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016226721","display_name":"Dhruv Joshi","orcid":"https://orcid.org/0009-0007-4543-846X"},"institutions":[{"id":"https://openalex.org/I145894827","display_name":"Indian Institute of Technology Kharagpur","ror":"https://ror.org/03w5sq511","country_code":"IN","type":"education","lineage":["https://openalex.org/I145894827"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Dhruv Joshi","raw_affiliation_strings":["Indian Institute of Technology Kharagpur, Kharagpur, India"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Technology Kharagpur, Kharagpur, India","institution_ids":["https://openalex.org/I145894827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113068983","display_name":"Shivam Pravin Bhosale","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shivam Pravin Bhosale","raw_affiliation_strings":["Adobe, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Adobe, Bangalore, India","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065358142","display_name":"S. Mitra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Subrata Mitra","raw_affiliation_strings":["Adobe Research, Bangalore, India"],"affiliations":[{"raw_affiliation_string":"Adobe Research, Bangalore, India","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5046282073"],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":1.7648,"has_fulltext":true,"cited_by_count":7,"citation_normalized_percentile":{"value":0.86200034,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"1","issue":"4","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11614","display_name":"Cloud Data Security Solutions","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9904000163078308,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7978278398513794},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.6100456118583679},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5219544768333435},{"id":"https://openalex.org/keywords/enterprise-data-management","display_name":"Enterprise data management","score":0.486128568649292},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4334202706813812},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.33235546946525574},{"id":"https://openalex.org/keywords/enterprise-information-system","display_name":"Enterprise information system","score":0.12466928362846375},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.10411134362220764}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7978278398513794},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.6100456118583679},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5219544768333435},{"id":"https://openalex.org/C136227091","wikidata":"https://www.wikidata.org/wiki/Q5380376","display_name":"Enterprise data management","level":3,"score":0.486128568649292},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4334202706813812},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.33235546946525574},{"id":"https://openalex.org/C27295321","wikidata":"https://www.wikidata.org/wiki/Q831795","display_name":"Enterprise information system","level":2,"score":0.12466928362846375},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.10411134362220764}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3626762","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3626762","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2312.13427","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.13427","pdf_url":"https://arxiv.org/pdf/2312.13427","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.13427","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.13427","pdf_url":"https://arxiv.org/pdf/2312.13427","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4389609547.pdf"},"referenced_works_count":27,"referenced_works":["https://openalex.org/W297231882","https://openalex.org/W1474119323","https://openalex.org/W2098431880","https://openalex.org/W2568154862","https://openalex.org/W2606791715","https://openalex.org/W2612677505","https://openalex.org/W2613536717","https://openalex.org/W2798412430","https://openalex.org/W2798664493","https://openalex.org/W2948163032","https://openalex.org/W2950930616","https://openalex.org/W2963174348","https://openalex.org/W2965116190","https://openalex.org/W3014705052","https://openalex.org/W3032215537","https://openalex.org/W3107132553","https://openalex.org/W3108851861","https://openalex.org/W3123375411","https://openalex.org/W3143219376","https://openalex.org/W3164968002","https://openalex.org/W3174637548","https://openalex.org/W3177445587","https://openalex.org/W3196867679","https://openalex.org/W3208996364","https://openalex.org/W4210558765","https://openalex.org/W4375928372","https://openalex.org/W4380433117"],"related_works":["https://openalex.org/W3037187668","https://openalex.org/W1495042958","https://openalex.org/W2494338568","https://openalex.org/W2122678784","https://openalex.org/W4234772502","https://openalex.org/W2282510344","https://openalex.org/W139987158","https://openalex.org/W2183994405","https://openalex.org/W2380685755","https://openalex.org/W1875646599"],"abstract_inverted_index":{"Enterprise":[0],"data":[1,14,48,101,163,197],"lakes":[2,49],"often":[3],"suffer":[4],"from":[5,17],"substantial":[6],"amounts":[7],"of":[8,54,60,66,91,224,230,238],"duplicate":[9],"and":[10,28,43,117,130,168,226],"redundant":[11,136],"data,":[12],"with":[13,207],"volumes":[15],"ranging":[16],"terabytes":[18],"to":[19,23,177,188],"petabytes.":[20],"This":[21],"leads":[22],"both":[24,221],"increased":[25],"storage":[26,129],"costs":[27,32,132],"unnecessarily":[29],"high":[30,208],"maintenance":[31],"for":[33,161,172],"these":[34],"datasets.":[35],"In":[36,175],"this":[37,63],"work,":[38],"we":[39],"focus":[40],"on":[41,144,154,169,220],"identifying":[42,135],"reducing":[44,95],"redundancy":[45],"in":[46,99,165,203],"enterprise":[47,162,195,222],"by":[50,93,112,133],"addressing":[51],"the":[52,58,67,96,100,127,200,236],"problem":[53],"\"dataset":[55],"containment\".":[56],"To":[57],"best":[59],"our":[61,152,190,239],"knowledge,":[62],"is":[64],"one":[65],"first":[68,104],"works":[69],"that":[70,85,138,181],"addresses":[71],"table-level":[72],"containment":[73,92,109],"at":[74,199],"a":[75,81,107],"large":[76],"scale.":[77],"We":[78,123,150,210],"propose":[79,125],"R2D2:":[80],"three-step":[82],"hierarchical":[83],"pipeline":[84,191],"efficiently":[86],"identifies":[87],"almost":[88],"all":[89],"instances":[90],"progressively":[94],"search":[97],"space":[98],"lake.":[102],"It":[103],"builds":[105],"(i)":[106],"schema":[108],"graph,":[110],"followed":[111],"(ii)":[113],"statistical":[114],"min-max":[115],"pruning,":[116],"finally,":[118],"(iii)":[119],"content":[120],"level":[121],"pruning.":[122],"further":[124],"minimizing":[126],"total":[128],"access":[131],"optimally":[134],"datasets":[137,228],"can":[139,192],"be":[140],"deleted":[141],"(and":[142],"reconstructed":[143],"demand)":[145],"while":[146],"respecting":[147],"latency":[148],"constraints.":[149],"implement":[151],"system":[153],"Azure":[155],"Databricks":[156],"clusters":[157,171],"using":[158],"Apache":[159],"Spark":[160],"stored":[164],"ADLS":[166],"Gen2,":[167],"AWS":[170],"open-source":[173,227],"data.":[174],"contrast":[176],"existing":[178],"modified":[179],"baselines":[180],"are":[182],"inaccurate":[183],"or":[184],"take":[185],"several":[186],"days":[187],"run,":[189],"process":[193],"an":[194],"customer":[196],"lake":[198],"TB":[201],"scale":[202],"approximately":[204],"5":[205],"hours":[206],"accuracy.":[209],"present":[211],"theoretical":[212],"results":[213],"as":[214,216],"well":[215],"extensive":[217],"empirical":[218],"validation":[219],"(scale":[223,229],"TBs)":[225],"MBs":[231],"-":[232],"GBs),":[233],"which":[234],"showcase":[235],"effectiveness":[237],"pipeline.":[240]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
