{"id":"https://openalex.org/W4385653220","doi":"https://doi.org/10.14778/3603581.3603587","title":"DeepJoin: Joinable Table Discovery with Pre-Trained Language Models","display_name":"DeepJoin: Joinable Table Discovery with Pre-Trained Language Models","publication_year":2023,"publication_date":"2023-06-01","ids":{"openalex":"https://openalex.org/W4385653220","doi":"https://doi.org/10.14778/3603581.3603587"},"language":"en","primary_location":{"id":"doi:10.14778/3603581.3603587","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3603581.3603587","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100518773","display_name":"Yuyang Dong","orcid":"https://orcid.org/0000-0001-7112-5212"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yuyang Dong","raw_affiliation_strings":["NEC Corporation"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NEC Corporation","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036148682","display_name":"Chuan Xiao","orcid":"https://orcid.org/0000-0001-7239-5134"},"institutions":[{"id":"https://openalex.org/I60134161","display_name":"Nagoya University","ror":"https://ror.org/04chrp450","country_code":"JP","type":"education","lineage":["https://openalex.org/I60134161"]},{"id":"https://openalex.org/I98285908","display_name":"The University of Osaka","ror":"https://ror.org/035t8zc32","country_code":"JP","type":"education","lineage":["https://openalex.org/I98285908"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Chuan Xiao","raw_affiliation_strings":["Osaka University and Nagoya University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Osaka University and Nagoya University","institution_ids":["https://openalex.org/I60134161","https://openalex.org/I98285908"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068486601","display_name":"Takuma Nozawa","orcid":"https://orcid.org/0000-0003-4077-3748"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takuma Nozawa","raw_affiliation_strings":["NEC Corporation"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NEC Corporation","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081798715","display_name":"Masafumi Enomoto","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Masafumi Enomoto","raw_affiliation_strings":["NEC Corporation"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NEC Corporation","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5044722101","display_name":"Masafumi Oyamada","orcid":"https://orcid.org/0000-0002-4045-7350"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Masafumi Oyamada","raw_affiliation_strings":["NEC Corporation"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NEC Corporation","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100518773"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":10.0946,"has_fulltext":false,"cited_by_count":42,"citation_normalized_percentile":{"value":0.98452851,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"16","issue":"10","first_page":"2458","last_page":"2470"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.791415810585022},{"id":"https://openalex.org/keywords/joins","display_name":"Joins","score":0.73194420337677},{"id":"https://openalex.org/keywords/column","display_name":"Column (typography)","score":0.7026075720787048},{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.6987622380256653},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5462630987167358},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.4887115955352783},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.47121256589889526},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.4552704989910126},{"id":"https://openalex.org/keywords/contextualization","display_name":"Contextualization","score":0.4552045166492462},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.44655364751815796},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3894560635089874},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3605600595474243},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.31729674339294434},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10622453689575195},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.10054886341094971}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.791415810585022},{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.73194420337677},{"id":"https://openalex.org/C2780551164","wikidata":"https://www.wikidata.org/wiki/Q2306599","display_name":"Column (typography)","level":3,"score":0.7026075720787048},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.6987622380256653},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5462630987167358},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.4887115955352783},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.47121256589889526},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.4552704989910126},{"id":"https://openalex.org/C2780712339","wikidata":"https://www.wikidata.org/wiki/Q5165204","display_name":"Contextualization","level":3,"score":0.4552045166492462},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.44655364751815796},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3894560635089874},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3605600595474243},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.31729674339294434},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10622453689575195},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.10054886341094971},{"id":"https://openalex.org/C54355233","wikidata":"https://www.wikidata.org/wiki/Q7162","display_name":"Genetics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.0},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3603581.3603587","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3603581.3603587","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":56,"referenced_works":["https://openalex.org/W2065259291","https://openalex.org/W2082179583","https://openalex.org/W2121516976","https://openalex.org/W2124509324","https://openalex.org/W2153579005","https://openalex.org/W2260484439","https://openalex.org/W2398606196","https://openalex.org/W2512971201","https://openalex.org/W2606791715","https://openalex.org/W2611029872","https://openalex.org/W2616147950","https://openalex.org/W2724806710","https://openalex.org/W2798664493","https://openalex.org/W2896457183","https://openalex.org/W2954922414","https://openalex.org/W2963174348","https://openalex.org/W2963469388","https://openalex.org/W2970641574","https://openalex.org/W2974875810","https://openalex.org/W2978017171","https://openalex.org/W3003937156","https://openalex.org/W3014265582","https://openalex.org/W3014616325","https://openalex.org/W3014705052","https://openalex.org/W3015883388","https://openalex.org/W3016473712","https://openalex.org/W3025624935","https://openalex.org/W3026889466","https://openalex.org/W3031051334","https://openalex.org/W3032215537","https://openalex.org/W3037852608","https://openalex.org/W3082424964","https://openalex.org/W3093315172","https://openalex.org/W3099965312","https://openalex.org/W3102264439","https://openalex.org/W3123375411","https://openalex.org/W3152438478","https://openalex.org/W3155200342","https://openalex.org/W3157891451","https://openalex.org/W3164968002","https://openalex.org/W3165814564","https://openalex.org/W3174637548","https://openalex.org/W3195079734","https://openalex.org/W4205922070","https://openalex.org/W4221143046","https://openalex.org/W4285335450","https://openalex.org/W4321448337","https://openalex.org/W4321448364","https://openalex.org/W6639274278","https://openalex.org/W6679663036","https://openalex.org/W6739901393","https://openalex.org/W6785231695","https://openalex.org/W6794246295","https://openalex.org/W6810851977","https://openalex.org/W6894243490","https://openalex.org/W6948093239"],"related_works":["https://openalex.org/W1687432146","https://openalex.org/W1591874556","https://openalex.org/W3216994056","https://openalex.org/W2185608106","https://openalex.org/W3046258185","https://openalex.org/W1548083239","https://openalex.org/W2268232908","https://openalex.org/W205610463","https://openalex.org/W2893240344","https://openalex.org/W4385272430"],"abstract_inverted_index":{"Due":[0],"to":[1,48,141,145,157,160,167,174,253,273,291],"the":[2,27,64,152,178,182,185,191,198,209,215,220,224],"usefulness":[3],"in":[4,19,63,177,189,214],"data":[5,8,20,229,233],"enrichment":[6],"for":[7,34,88,127,226],"analysis":[9],"tasks,":[10],"joinable":[11,92,169],"table":[12,71,93],"discovery":[13],"has":[14],"become":[15],"an":[16,98,270],"important":[17],"operation":[18],"lake":[21],"management.":[22],"Existing":[23],"approaches":[24],"target":[25,70],"equi-joins,":[26],"most":[28],"common":[29],"way":[30],"of":[31,66,138,184,197,248,294],"combining":[32],"tables":[33],"creating":[35],"a":[36,84,103,120,136,146,202,245,249,286],"unified":[37],"view,":[38],"or":[39,73],"semantic":[40,118,274],"joins,":[41],"which":[42,101],"tolerate":[43],"misspellings":[44],"and":[45,69,90,108,117,154,256],"different":[46],"formats":[47],"deliver":[49],"more":[50,267],"join":[51],"results.":[52],"They":[53],"are":[54,165,172],"either":[55],"exact":[56,271],"solutions":[57,75],"whose":[58],"running":[59],"time":[60,211],"is":[61,97,109,155,187,212,265,289],"linear":[62],"sizes":[65],"query":[67],"column":[68,143,199],"repository,":[72],"approximate":[74,204,262],"lacking":[76],"precision.":[77],"In":[78],"this":[79],"paper,":[80],"we":[81,222],"propose":[82,135],"DeepJoin,":[83],"deep":[85],"learning":[86],"model":[87,106],"accurate":[89,268],"efficient":[91],"discovery.":[94],"Our":[95],"solution":[96,272],"embedding-based":[99],"retrieval,":[100],"employs":[102],"pre-trained":[104],"language":[105],"(PLM)":[107],"designed":[110],"as":[111,230,232],"one":[112],"framework":[113],"serving":[114],"both":[115],"equi-":[116],"(with":[119],"similarity":[121],"condition":[122],"on":[123,237,244],"word":[124],"embeddings)":[125],"joins":[126,275],"textual":[128],"attributes":[129],"with":[130,278,285],"fairly":[131],"small":[132,246],"cardinalities.":[133],"We":[134],"set":[137],"contextualization":[139],"options":[140],"transform":[142],"contents":[144],"text":[147],"sequence.":[148],"The":[149,235],"PLM":[150,186],"reads":[151],"sequence":[153],"fine-tuned":[156],"embed":[158],"columns":[159,164],"vectors":[161],"such":[162],"that":[163,241],"expected":[166],"be":[168],"if":[170],"they":[171],"close":[173],"each":[175],"other":[176,261],"vector":[179],"space.":[180],"Since":[181],"output":[183],"fixed":[188],"length,":[190],"subsequent":[192],"search":[193,207,210],"procedure":[194],"becomes":[195],"independent":[196],"size.":[200,217],"With":[201],"state-of-the-art":[203],"nearest":[205],"neighbor":[206],"algorithm,":[208],"sublinear":[213],"repository":[216],"To":[218],"train":[219],"model,":[221],"devise":[223],"techniques":[225],"preparing":[227],"training":[228,243],"well":[231],"augmentation.":[234],"experiments":[236],"real":[238],"datasets":[239,255],"demonstrate":[240],"by":[242],"subset":[247],"corpus,":[250],"DeepJoin":[251,264,288],"generalizes":[252],"large":[254],"its":[257],"precision":[258],"consistently":[259],"outperforms":[260],"solutions'.":[263],"even":[266],"than":[269,297],"when":[276,283],"evaluated":[277],"labels":[279],"from":[280],"experts.":[281],"Moreover,":[282],"equipped":[284],"GPU,":[287],"up":[290],"two":[292],"orders":[293],"magnitude":[295],"faster":[296],"existing":[298],"solutions.":[299]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":27},{"year":2024,"cited_by_count":10},{"year":2023,"cited_by_count":2}],"updated_date":"2026-05-21T09:19:25.381259","created_date":"2025-10-10T00:00:00"}
