{"id":"https://openalex.org/W4409982904","doi":"https://doi.org/10.1137/1.9781611978520.42","title":"Language Models are Explorers for Join Discovery on Data Lakes","display_name":"Language Models are Explorers for Join Discovery on Data Lakes","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4409982904","doi":"https://doi.org/10.1137/1.9781611978520.42"},"language":"en","primary_location":{"id":"doi:10.1137/1.9781611978520.42","is_oa":false,"landing_page_url":"https://doi.org/10.1137/1.9781611978520.42","pdf_url":null,"source":{"id":"https://openalex.org/S4306463922","display_name":"Society for Industrial and Applied Mathematics eBooks","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320508","host_organization_name":"Society for Industrial and Applied Mathematics","host_organization_lineage":["https://openalex.org/P4310320508"],"host_organization_lineage_names":["Society for Industrial and Applied Mathematics"],"type":"ebook platform"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 SIAM International Conference on Data Mining (SDM)","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070012760","display_name":"Yaohua Wang","orcid":"https://orcid.org/0009-0005-6211-6388"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yaohua Wang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040297543","display_name":"Bolin Ding","orcid":"https://orcid.org/0000-0003-1535-9692"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]},{"id":"https://openalex.org/I4210108542","display_name":"Schlumberger (Ireland)","ror":"https://ror.org/01mcc7007","country_code":"IE","type":"company","lineage":["https://openalex.org/I4210092184","https://openalex.org/I4210108542"]}],"countries":["IE","US"],"is_corresponding":false,"raw_author_name":"Bolin Ding","raw_affiliation_strings":["Alibaba Group","The corresponding author"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]},{"raw_affiliation_string":"The corresponding author","institution_ids":["https://openalex.org/I4210108542"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100686442","display_name":"Rong Zhu","orcid":"https://orcid.org/0000-0003-4019-8587"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rong Zhu","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100408836","display_name":"Haibin Wang","orcid":"https://orcid.org/0000-0002-9865-324X"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haibin Wang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039564129","display_name":"Zhijian Ma","orcid":"https://orcid.org/0009-0007-8674-5351"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhijian Ma","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057864403","display_name":"Jingren Zhou","orcid":"https://orcid.org/0000-0002-4220-2634"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jingren Zhou","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5070012760"],"corresponding_institution_ids":["https://openalex.org/I4210095624"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18049973,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"398","last_page":"408"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.979200005531311,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.979200005531311,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9707000255584717,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.967199981212616,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/join","display_name":"Join (topology)","score":0.8777157068252563},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5157592296600342},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.35420745611190796},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3390144407749176},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.3248186707496643},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07932832837104797}],"concepts":[{"id":"https://openalex.org/C2776124973","wikidata":"https://www.wikidata.org/wiki/Q3183033","display_name":"Join (topology)","level":2,"score":0.8777157068252563},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5157592296600342},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.35420745611190796},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3390144407749176},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3248186707496643},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07932832837104797},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1137/1.9781611978520.42","is_oa":false,"landing_page_url":"https://doi.org/10.1137/1.9781611978520.42","pdf_url":null,"source":{"id":"https://openalex.org/S4306463922","display_name":"Society for Industrial and Applied Mathematics eBooks","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320508","host_organization_name":"Society for Industrial and Applied Mathematics","host_organization_lineage":["https://openalex.org/P4310320508"],"host_organization_lineage_names":["Society for Industrial and Applied Mathematics"],"type":"ebook platform"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 SIAM International Conference on Data Mining (SDM)","raw_type":"book-chapter"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4099999964237213,"display_name":"Climate action","id":"https://metadata.un.org/sdg/13"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W4205996836","https://openalex.org/W2151692181","https://openalex.org/W4392498349","https://openalex.org/W2093960938","https://openalex.org/W4407788647","https://openalex.org/W3214148052","https://openalex.org/W4392216655"],"abstract_inverted_index":{"Join":[0,159],"discovery":[1,175,226,276],"is":[2,20,116,169,281],"a":[3,55,157,245,252,288],"typical":[4],"task":[5,123],"in":[6,25,38,90,98,104,121],"data":[7,23,27,91,105,125,177,256],"lake":[8],"research.":[9],"It":[10,93,179],"finds":[11],"joinable":[12],"relationships":[13],"between":[14,133],"columns":[15],"of":[16,118,130,145,186,209,215,248,268,290,310],"different":[17],"tables":[18],"which":[19],"critical":[21],"for":[22,184,193,244,270],"integration":[24],"schema-less":[26],"lakes.":[28,178],"On":[29],"the":[30,48,79,119,122,128,134,142,187,190,199,213,262,278,308],"other":[31],"hand,":[32],"LLMs":[33,146,259,269],"have":[34],"achieved":[35],"promising":[36],"results":[37],"many":[39],"natural":[40],"language":[41,242],"tasks":[42],"recently.":[43],"They":[44],"behave":[45],"well":[46,302],"with":[47,162,173,258,285],"in-context":[49],"learning":[50],"ability":[51],"that":[52,64,95,231,296],"only":[53],"needs":[54],"few":[56],"examples":[57,183],"and":[58,127,147,164,189,241,260,264,283,287,303],"no":[59],"fine-tuning,":[60],"showing":[61,307],"great":[62],"advantages":[63],"can":[65,220],"be":[66,102,221],"applied":[67],"to":[68,78,84,88,112,140,171,223,227,254],"join":[69,174,225,271,275],"discovery.":[70,272],"However,":[71],"directly":[72],"applying":[73],"NLP":[74,99],"prompt":[75,188,200],"generation":[76],"methods":[77],"table":[80],"modality":[81],"may":[82,148],"lead":[83],"performance":[85],"decline":[86],"due":[87],"differences":[89,120],"modalities.":[92],"means":[94],"strategies":[96],"useful":[97],"scenarios":[100],"cannot":[101],"used":[103],"lakes":[106,257],"or":[107],"obtain":[108],"limited":[109],"efficacy,":[110],"leading":[111],"sub-optimal":[113],"prompts.":[114],"This":[115],"because":[117],"definition,":[124],"modalities":[126],"availability":[129],"labeled":[131],"samples":[132],"two":[135],"scenarios.":[136],"Unsuitable":[137],"prompts":[138],"fail":[139],"harness":[141],"full":[143],"potential":[144],"even":[149],"mislead":[150],"them":[151],"into":[152],"producing":[153],"incorrect":[154],"answers.":[155],"Therefore,":[156],"novel":[158],"Discovery":[160],"System":[161],"Comprehensive":[163],"Optimized":[165],"Prompt":[166],"Engineering":[167],"(JD-SCOPE)":[168],"proposed":[170],"deal":[172],"on":[176],"first":[180],"constructs":[181],"unsupervised":[182],"demonstrations":[185],"validation":[191],"set":[192],"hyper-parameters":[194],"tuning.":[195],"Then":[196],"it":[197],"explores":[198],"template":[201],"automatically.":[202],"Meanwhile,":[203],"JD-SCOPE":[204,219,250,297],"also":[205],"ensures":[206],"stable":[207],"outputs":[208],"LLMs,":[210],"thus":[211],"avoiding":[212],"impact":[214],"randomness.":[216],"In":[217],"addition,":[218],"extended":[222],"semantic":[224],"explore":[228],"column":[229],"pairs":[230],"are":[232],"domain":[233],"correlated":[234],"but":[235],"not":[236],"identical":[237],"(e.g.,":[238],"country":[239],"code":[240],"code)":[243],"comprehension":[246],"understanding":[247],"data.":[249],"paves":[251],"way":[253],"ground":[255],"harnesses":[261],"knowledge":[263],"logical":[265],"reasoning":[266],"power":[267],"To":[273],"evaluate":[274],"effectively,":[277],"JD-Lake":[279],"dataset":[280],"curated":[282],"benchmarked":[284],"baselines":[286],"series":[289],"popular":[291],"LLMs.":[292],"The":[293],"experiments":[294],"prove":[295],"alleviates":[298],"this":[299],"migration":[300],"problem":[301],"empirically":[304],"outperforms":[305],"alternatives,":[306],"superiority":[309],"JD-SCOPE.":[311]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
