{"id":"https://openalex.org/W4375928372","doi":"https://doi.org/10.14778/3587136.3587146","title":"Semantics-Aware Dataset Discovery from Data Lakes with Contextualized Column-Based Representation Learning","display_name":"Semantics-Aware Dataset Discovery from Data Lakes with Contextualized Column-Based Representation Learning","publication_year":2023,"publication_date":"2023-03-01","ids":{"openalex":"https://openalex.org/W4375928372","doi":"https://doi.org/10.14778/3587136.3587146"},"language":"en","primary_location":{"id":"doi:10.14778/3587136.3587146","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3587136.3587146","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049651913","display_name":"Grace Fan","orcid":"https://orcid.org/0000-0001-9020-3642"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Grace Fan","raw_affiliation_strings":["Northeastern University, United States"],"affiliations":[{"raw_affiliation_string":"Northeastern University, United States","institution_ids":["https://openalex.org/I12912129"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100346150","display_name":"Jin Wang","orcid":"https://orcid.org/0000-0002-3172-6133"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin Wang","raw_affiliation_strings":["Megagon Labs, United States"],"affiliations":[{"raw_affiliation_string":"Megagon Labs, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100750716","display_name":"Yuliang Li","orcid":"https://orcid.org/0000-0002-0602-149X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuliang Li","raw_affiliation_strings":["Megagon Labs, United States"],"affiliations":[{"raw_affiliation_string":"Megagon Labs, United States","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100456041","display_name":"Dan Zhang","orcid":"https://orcid.org/0000-0002-7295-4837"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dan Zhang","raw_affiliation_strings":["Megagon Labs, United States"],"affiliations":[{"raw_affiliation_string":"Megagon Labs, United States","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5022619313","display_name":"Ren\u00e9e J. Miller","orcid":"https://orcid.org/0000-0002-1484-4787"},"institutions":[{"id":"https://openalex.org/I12912129","display_name":"Northeastern University","ror":"https://ror.org/04t5xt781","country_code":"US","type":"education","lineage":["https://openalex.org/I12912129"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ren\u00e9e J. Miller","raw_affiliation_strings":["Northeastern University, United States"],"affiliations":[{"raw_affiliation_string":"Northeastern University, United States","institution_ids":["https://openalex.org/I12912129"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5049651913"],"corresponding_institution_ids":["https://openalex.org/I12912129"],"apc_list":null,"apc_paid":null,"fwci":15.5813,"has_fulltext":false,"cited_by_count":62,"citation_normalized_percentile":{"value":0.9939014,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"16","issue":"7","first_page":"1726","last_page":"1739"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9825999736785889,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8004230856895447},{"id":"https://openalex.org/keywords/column","display_name":"Column (typography)","score":0.6751010417938232},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.5775144100189209},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.553671658039093},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.452411413192749},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.45124053955078125},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.41729676723480225},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4157223403453827},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4125576615333557},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.39554041624069214},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3866143822669983},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.34277474880218506}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8004230856895447},{"id":"https://openalex.org/C2780551164","wikidata":"https://www.wikidata.org/wiki/Q2306599","display_name":"Column (typography)","level":3,"score":0.6751010417938232},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.5775144100189209},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.553671658039093},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.452411413192749},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.45124053955078125},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.41729676723480225},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4157223403453827},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4125576615333557},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39554041624069214},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3866143822669983},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34277474880218506},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3587136.3587146","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3587136.3587146","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W1969621019","https://openalex.org/W1976022204","https://openalex.org/W1996505782","https://openalex.org/W2012833704","https://openalex.org/W2092364718","https://openalex.org/W2108223890","https://openalex.org/W2111869785","https://openalex.org/W2140116426","https://openalex.org/W2295598076","https://openalex.org/W2341748398","https://openalex.org/W2529049456","https://openalex.org/W2752618741","https://openalex.org/W2798664493","https://openalex.org/W2810954846","https://openalex.org/W2889003264","https://openalex.org/W2951438725","https://openalex.org/W2952479794","https://openalex.org/W2963174348","https://openalex.org/W2963469388","https://openalex.org/W2970992672","https://openalex.org/W2971681342","https://openalex.org/W2979826702","https://openalex.org/W3005680577","https://openalex.org/W3013008430","https://openalex.org/W3014616325","https://openalex.org/W3014705052","https://openalex.org/W3082424964","https://openalex.org/W3094328550","https://openalex.org/W3103177583","https://openalex.org/W3123375411","https://openalex.org/W3145728363","https://openalex.org/W3174637548","https://openalex.org/W3175201267","https://openalex.org/W3196904276","https://openalex.org/W4205922070","https://openalex.org/W4213009331","https://openalex.org/W4288253152","https://openalex.org/W4289533971","https://openalex.org/W4303449483","https://openalex.org/W4321448337","https://openalex.org/W6753529518","https://openalex.org/W6788399295"],"related_works":["https://openalex.org/W3024364549","https://openalex.org/W4206019083","https://openalex.org/W1976265003","https://openalex.org/W2054476758","https://openalex.org/W2370378377","https://openalex.org/W2048865712","https://openalex.org/W4210535024","https://openalex.org/W4237510188","https://openalex.org/W3123448197","https://openalex.org/W1510114644"],"abstract_inverted_index":{"Dataset":[0],"discovery":[1,23],"from":[2,24,48],"data":[3,25,185],"lakes":[4,26],"is":[5,141],"essential":[6],"in":[7,52,126,135],"many":[8],"real":[9,116],"application":[10],"scenarios.":[11],"In":[12],"this":[13],"paper,":[14],"we":[15],"propose":[16,92],"Starmie,":[17],"an":[18,178],"end-to-end":[19],"framework":[20,38,95],"for":[21,184],"dataset":[22],"(with":[27],"table":[28,117,130,158],"union":[29,131,159],"search":[30,132,160],"as":[31,86],"the":[32,63,79,87,106,123,127,142,146,168],"main":[33],"use":[34],"case).":[35],"Our":[36],"proposed":[37],"features":[39],"a":[40,53,72,93,99,163,173],"contrastive":[41,73],"learning":[42],"method":[43],"to":[44,104,144,153],"train":[45],"column":[46,58,83,88],"encoders":[47],"pre-trained":[49],"language":[50],"models":[51],"fully":[54],"unsupervised":[55],"manner.":[56],"The":[57],"encoder":[59],"of":[60,101,129,157],"Starmie":[61,121,140],"captures":[62],"rich":[64],"contextual":[65],"semantic":[66],"information":[67],"within":[68],"tables":[69,111],"by":[70,133],"leveraging":[71],"multi-column":[74],"pre-training":[75],"strategy.":[76],"We":[77],"utilize":[78],"cosine":[80],"similarity":[81],"between":[82,109],"embedding":[84],"vectors":[85],"unionability":[89,107],"score":[90,108],"and":[91,137,172],"filter-and-verification":[94],"that":[96,120],"allows":[97],"exploring":[98],"variety":[100],"design":[102],"choices":[103],"compute":[105],"two":[110],"accordingly.":[112],"Empirical":[113],"results":[114],"on":[115],"benchmarks":[118],"show":[119],"outperforms":[122],"best-known":[124],"solutions":[125],"effectiveness":[128],"6.8":[134],"MAP":[136],"recall.":[138],"Moreover,":[139],"first":[143],"employ":[145],"HNSW":[147],"(Hierarchical":[148],"Navigable":[149],"Small":[150],"World)":[151],"index":[152,180],"accelerate":[154],"query":[155],"processing":[156],"which":[161],"provides":[162],"3,000X":[164],"performance":[165,175],"gain":[166,176],"over":[167,177],"linear":[169],"scan":[170],"baseline":[171],"400X":[174],"LSH":[179],"(the":[181],"state-of-the-art":[182],"solution":[183],"lake":[186],"indexing).":[187]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":31},{"year":2024,"cited_by_count":18},{"year":2023,"cited_by_count":9}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2025-10-10T00:00:00"}
