{"id":"https://openalex.org/W4411374464","doi":"https://doi.org/10.1145/3722212.3724430","title":"A <scp>uto</scp> C <scp>omp</scp> : Automated Data Compaction for Log-Structured Tables in Data Lakes","display_name":"A <scp>uto</scp> C <scp>omp</scp> : Automated Data Compaction for Log-Structured Tables in Data Lakes","publication_year":2025,"publication_date":"2025-06-17","ids":{"openalex":"https://openalex.org/W4411374464","doi":"https://doi.org/10.1145/3722212.3724430"},"language":"en","primary_location":{"id":"doi:10.1145/3722212.3724430","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3722212.3724430","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2025 International Conference on Management of Data","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3722212.3724430","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5043472177","display_name":"Anja Gruenheid","orcid":"https://orcid.org/0009-0009-2547-8610"},"institutions":[{"id":"https://openalex.org/I4210139986","display_name":"Microsoft (Switzerland)","ror":"https://ror.org/03zryq964","country_code":"CH","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210139986"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Anja Gruenheid","raw_affiliation_strings":["Microsoft, Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"Microsoft, Zurich, Switzerland","institution_ids":["https://openalex.org/I4210139986"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058019805","display_name":"Jes\u00fas Camacho-Rodr\u00edguez","orcid":"https://orcid.org/0009-0008-9151-6024"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jes\u00fas Camacho-Rodr\u00edguez","raw_affiliation_strings":["Microsoft, Mountain View, CA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Mountain View, CA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010481434","display_name":"Carlo Curino","orcid":"https://orcid.org/0000-0003-3712-7358"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Carlo Curino","raw_affiliation_strings":["Microsoft, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051301731","display_name":"Raghu Ramakrishnan","orcid":"https://orcid.org/0009-0007-5086-7664"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Raghu Ramakrishnan","raw_affiliation_strings":["Microsoft, Redmond, WA, USA"],"affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, WA, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014959047","display_name":"Stanislav Pak","orcid":null},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stanislav Pak","raw_affiliation_strings":["LinkedIn, Sunnyvale, CA, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn, Sunnyvale, CA, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118325361","display_name":"Sumedh Sakdeo","orcid":"https://orcid.org/0009-0005-4677-5885"},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sumedh Sakdeo","raw_affiliation_strings":["LinkedIn, Sunnyvale, CA, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn, Sunnyvale, CA, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118325362","display_name":"Lenisha Gandhi","orcid":null},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lenisha Gandhi","raw_affiliation_strings":["LinkedIn, Sunnyvale, CA, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn, Sunnyvale, CA, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118273622","display_name":"Sandeep K. Singhal","orcid":null},"institutions":[{"id":"https://openalex.org/I1316064682","display_name":"LinkedIn (United States)","ror":"https://ror.org/02fyxhe35","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I1316064682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sandeep K. Singhal","raw_affiliation_strings":["LinkedIn, Sunnyvale, CA, USA"],"affiliations":[{"raw_affiliation_string":"LinkedIn, Sunnyvale, CA, USA","institution_ids":["https://openalex.org/I1316064682"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5118273623","display_name":"Pooja Nilangekar","orcid":null},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pooja Nilangekar","raw_affiliation_strings":["University of Maryland, College Park, MD, USA"],"affiliations":[{"raw_affiliation_string":"University of Maryland, College Park, MD, USA","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049333271","display_name":"Daniel J. Abadi","orcid":"https://orcid.org/0000-0003-3771-2995"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daniel J. Abadi","raw_affiliation_strings":["University of Maryland, College Park, MD, USA"],"affiliations":[{"raw_affiliation_string":"University of Maryland, College Park, MD, USA","institution_ids":["https://openalex.org/I66946132"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5043472177"],"corresponding_institution_ids":["https://openalex.org/I4210139986"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14661474,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"404","last_page":"417"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compaction","display_name":"Compaction","score":0.47782400250434875},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.3658882975578308},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.13050112128257751}],"concepts":[{"id":"https://openalex.org/C196715460","wikidata":"https://www.wikidata.org/wiki/Q1414356","display_name":"Compaction","level":2,"score":0.47782400250434875},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3658882975578308},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.13050112128257751},{"id":"https://openalex.org/C187320778","wikidata":"https://www.wikidata.org/wiki/Q1349130","display_name":"Geotechnical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3722212.3724430","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3722212.3724430","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2025 International Conference on Management of Data","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2504.04186","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.04186","pdf_url":"https://arxiv.org/pdf/2504.04186","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3722212.3724430","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3722212.3724430","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion of the 2025 International Conference on Management of Data","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/13","display_name":"Climate action","score":0.5099999904632568}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2014977566","https://openalex.org/W2035735180","https://openalex.org/W2159886933","https://openalex.org/W2243935923","https://openalex.org/W2426624872","https://openalex.org/W2612261081","https://openalex.org/W2925032266","https://openalex.org/W3197204476","https://openalex.org/W4366492460","https://openalex.org/W4398233958","https://openalex.org/W4404181225","https://openalex.org/W4404181433"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W3013719031","https://openalex.org/W3140187064","https://openalex.org/W2028550458","https://openalex.org/W2378755530","https://openalex.org/W585895904","https://openalex.org/W2004528270","https://openalex.org/W2485606874"],"abstract_inverted_index":{"The":[0],"proliferation":[1],"of":[2,53,107,122,188],"small":[3,55,123],"files":[4,56],"in":[5,21,165],"data":[6,101,109,161,201],"lakes":[7],"poses":[8],"significant":[9,163],"challenges,":[10],"including":[11],"degraded":[12],"query":[13,170],"performance,":[14],"increased":[15],"storage":[16,23],"costs,":[17],"and":[18,35,47,71,78,88,133,146,160,169,196],"scalability":[19,72],"bottlenecks":[20],"distributed":[22],"systems.":[24],"Log-structured":[25],"table":[26],"formats":[27],"(LSTs)":[28],"such":[29],"as":[30],"Delta":[31],"Lake,":[32],"Apache":[33,36],"Iceberg,":[34],"Hudi":[37],"exacerbate":[38],"this":[39,91],"issue":[40],"due":[41],"to":[42,73,75,104],"their":[43],"append-only":[44],"write":[45],"patterns":[46],"metadata-intensive":[48],"operations.":[49],"While":[50],"compaction--the":[51],"process":[52],"consolidating":[54],"into":[57],"fewer,":[58],"larger":[59],"files--is":[60],"a":[61,96,178],"common":[62],"solution,":[63],"existing":[64],"automation":[65],"mechanisms":[66],"often":[67],"lack":[68],"the":[69,83,105,119],"flexibility":[70],"adapt":[74],"diverse":[76],"workloads":[77],"system":[79],"requirements":[80,128],"while":[81],"balancing":[82],"trade-offs":[84],"between":[85],"compaction":[86,102,183,194],"benefits":[87],"costs.":[89],"In":[90],"paper,":[92],"we":[93,117],"present":[94],"AutoComp,":[95],"scalable":[97],"framework":[98],"for":[99,129,155,181,199],"automatic":[100,131],"tailored":[103],"needs":[106],"modern":[108],"lakes.":[110],"Drawing":[111],"on":[112],"deployment":[113],"experience":[114],"at":[115],"LinkedIn,":[116],"analyze":[118],"operational":[120],"impact":[121],"file":[124,166],"proliferation,":[125],"establish":[126],"key":[127],"effective":[130],"compaction,":[132],"demonstrate":[134],"how":[135],"AutoComp":[136],"addresses":[137],"these":[138],"challenges.":[139],"Our":[140],"evaluation,":[141],"conducted":[142],"using":[143],"synthetic":[144],"benchmarks":[145],"production":[147],"environments":[148],"via":[149],"integration":[150,187],"with":[151],"OpenHouse--a":[152],"control":[153],"plane":[154],"catalog":[156],"management,":[157],"schema":[158],"governance,":[159],"services--shows":[162],"improvements":[164],"count":[167],"reduction":[168],"performance.":[171],"We":[172],"believe":[173],"AutoComp's":[174],"built-in":[175],"extensibility":[176],"provides":[177],"robust":[179],"foundation":[180],"evolving":[182],"systems,":[184],"facilitating":[185],"future":[186],"refined":[189],"multi-objective":[190],"optimization":[191],"approaches,":[192],"workload-aware":[193],"strategies,":[195],"expanded":[197],"support":[198],"broader":[200],"layout":[202],"optimizations.":[203]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-06-18T00:00:00"}
