{"id":"https://openalex.org/W4402042553","doi":"https://doi.org/10.14778/3681954.3682005","title":"Searching Data Lakes for Nested and Joined Data","display_name":"Searching Data Lakes for Nested and Joined Data","publication_year":2024,"publication_date":"2024-07-01","ids":{"openalex":"https://openalex.org/W4402042553","doi":"https://doi.org/10.14778/3681954.3682005"},"language":"en","primary_location":{"id":"doi:10.14778/3681954.3682005","is_oa":false,"landing_page_url":"http://dx.doi.org/10.14778/3681954.3682005","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100602517","display_name":"Yi Zhang","orcid":"https://orcid.org/0000-0001-8194-6057"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yi Zhang","raw_affiliation_strings":["AWS AI Labs"],"affiliations":[{"raw_affiliation_string":"AWS AI Labs","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002157683","display_name":"Peter Baile Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I4210109586","display_name":"Moscow Institute of Thermal Technology","ror":"https://ror.org/021es5e59","country_code":"RU","type":"facility","lineage":["https://openalex.org/I4210109586"]}],"countries":["RU"],"is_corresponding":false,"raw_author_name":"Peter Baile Chen","raw_affiliation_strings":["MIT"],"affiliations":[{"raw_affiliation_string":"MIT","institution_ids":["https://openalex.org/I4210109586"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063822723","display_name":"Zachary G. Ives","orcid":"https://orcid.org/0000-0001-7527-2957"},"institutions":[{"id":"https://openalex.org/I36788626","display_name":"California University of Pennsylvania","ror":"https://ror.org/01spssf70","country_code":"US","type":"education","lineage":["https://openalex.org/I36788626"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zachary G. Ives","raw_affiliation_strings":["University of Pennsylvania"],"affiliations":[{"raw_affiliation_string":"University of Pennsylvania","institution_ids":["https://openalex.org/I36788626"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100602517"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.17566825,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"17","issue":"11","first_page":"3346","last_page":"3359"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8246633410453796},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.5809794664382935},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5629099011421204},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.499464750289917},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.49142634868621826},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.48088958859443665},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.47813618183135986},{"id":"https://openalex.org/keywords/data-integration","display_name":"Data integration","score":0.4534703195095062},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.45131799578666687},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.4196586608886719},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.39287376403808594},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1495783030986786}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8246633410453796},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.5809794664382935},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5629099011421204},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.499464750289917},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.49142634868621826},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.48088958859443665},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.47813618183135986},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.4534703195095062},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.45131799578666687},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.4196586608886719},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.39287376403808594},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1495783030986786},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3681954.3682005","is_oa":false,"landing_page_url":"http://dx.doi.org/10.14778/3681954.3682005","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1600580971","https://openalex.org/W2035274534","https://openalex.org/W2066806792","https://openalex.org/W2092364718","https://openalex.org/W2108014443","https://openalex.org/W2108223890","https://openalex.org/W2341748398","https://openalex.org/W2438792749","https://openalex.org/W2501364796","https://openalex.org/W2575168421","https://openalex.org/W2795089200","https://openalex.org/W2810954846","https://openalex.org/W2889133671","https://openalex.org/W2926805670","https://openalex.org/W2941232686","https://openalex.org/W2948163032","https://openalex.org/W2963174348","https://openalex.org/W2963469388","https://openalex.org/W2970992672","https://openalex.org/W3014616325","https://openalex.org/W3021274480","https://openalex.org/W3029367893","https://openalex.org/W3032215537","https://openalex.org/W3166632383","https://openalex.org/W3174637548","https://openalex.org/W4281826654","https://openalex.org/W4321448337","https://openalex.org/W4375928372","https://openalex.org/W4379390735","https://openalex.org/W4380433117","https://openalex.org/W4383051975","https://openalex.org/W4385569963","https://openalex.org/W4385653220","https://openalex.org/W6753529518"],"related_works":["https://openalex.org/W2280422768","https://openalex.org/W3143197806","https://openalex.org/W4252555497","https://openalex.org/W3121175838","https://openalex.org/W3016293053","https://openalex.org/W1690653314","https://openalex.org/W2401723157","https://openalex.org/W3024364549","https://openalex.org/W2065055572","https://openalex.org/W2784269775"],"abstract_inverted_index":{"Exploratory":[0],"data":[1,9,28,40,44,58,77,101,110,165,167,202,221],"science":[2,78,168,222],"is":[3,125],"driving":[4],"new":[5],"platforms":[6],"that":[7,92],"assist":[8],"scientists":[10],"with":[11,53,71,175],"everyday":[12],"tasks,":[13,224],"such":[14],"as":[15,29],"integration":[16],"and":[17,36,95,143,145,151,160,192,199,233],"wrangling,":[18],"to":[19,67,134,189,197,207,213,220,231,239],"assemble":[20],"training":[21],"datasets.":[22],"Such":[23],"tools":[24,61],"take":[25],"scientists'":[26],"work-in-progress":[27],"a":[30,72,126,210],"search":[31,60,73,112,149],"object":[32],"(table":[33],"or":[34,51,69],"JSON)":[35],"find":[37,62],"relevant":[38],"supplementary":[39],"from":[41,166],"an":[42],"organizational":[43],"lake":[45,59,111],",":[46,64],"which":[47,84],"can":[48,85,181],"be":[49,87],"unioned":[50],"joined":[52],"the":[54,100,108,183,201,215],"current":[55],"data.":[56],"Existing":[57],"single":[63],"relational":[65],"tables":[66,98],"match":[68,135],"join":[70,94],"object.":[74],"Yet":[75],"many":[76],"applications":[79],"revolve":[80],"around":[81],"hierarchical":[82,136],"data,":[83,137],"only":[86],"matched":[88],"by":[89,205,229,237],"creating":[90],"views":[91,187],"simultaneously":[93],"transform":[96],"several":[97],"in":[99],"lake.":[102],"In":[103,209],"this":[104,116],"paper,":[105],"we":[106,225],"extend":[107],"Juneau":[109],"system":[113],"[46]":[114],"for":[115,129,141],"broader":[117],"class":[118],"of":[119,186,217],"matches":[120],"at":[121],"scale.":[122],"Our":[123,171],"contribution":[124],"general":[127],"framework":[128],"efficiently":[130],"merging":[131],"ranked":[132],"results":[133,172,219],"leveraging":[138],"novel":[139],"techniques":[140,150],"indexing":[142],"sketching,":[144],"incorporating":[146],"existing":[147],"single-table":[148],"ranking":[152,177],"functions.":[153],"We":[154],"experimentally":[155],"validate":[156],"our":[157,179,218],"methods'":[158],"benefits":[159],"broad":[161],"applicability":[162],"using":[163],"real":[164],"computational":[169],"notebooks.":[170],"indicate":[173],"that,":[174],"different":[176],"functions,":[178],"approach":[180],"return":[182],"optimal":[184],"set":[185],"up":[188,206,230,238],"4.8x":[190],"faster":[191],"43%":[193],"more":[194],"related":[195],"compared":[196],"heuristics,":[198],"increase":[200],"domain":[203],"coverage":[204],"28%.":[208],"case":[211],"study":[212],"show":[214],"utility":[216],"downstream":[223],"reduce":[226],"regression":[227],"error":[228],"6.6%,":[232],"improve":[234],"classification":[235],"accuracy":[236],"19.5%.":[240]},"counts_by_year":[],"updated_date":"2025-12-19T19:40:27.379048","created_date":"2025-10-10T00:00:00"}
