{"id":"https://openalex.org/W4379057097","doi":"https://doi.org/10.48550/arxiv.2305.19629","title":"Measuring and Predicting the Quality of a Join for Data Discovery","display_name":"Measuring and Predicting the Quality of a Join for Data Discovery","publication_year":2023,"publication_date":"2023-05-31","ids":{"openalex":"https://openalex.org/W4379057097","doi":"https://doi.org/10.48550/arxiv.2305.19629"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2305.19629","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.19629","pdf_url":"https://arxiv.org/pdf/2305.19629","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2305.19629","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013379001","display_name":"Sergi Nadal","orcid":"https://orcid.org/0000-0002-8565-952X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Nadal, Sergi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112225809","display_name":"Raquel Panadero","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Panadero, Raquel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034865842","display_name":"Javier Gil Flores","orcid":"https://orcid.org/0000-0003-0755-4367"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Flores, Javier","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5025851055","display_name":"Oscar E Romero","orcid":"https://orcid.org/0000-0003-0209-3258"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Romero, Oscar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5013379001"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.982200026512146,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/join","display_name":"Join (topology)","score":0.8163855075836182},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.77101731300354},{"id":"https://openalex.org/keywords/cardinality","display_name":"Cardinality (data modeling)","score":0.7528167963027954},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.6207709312438965},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5971840023994446},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.5444194078445435},{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.5410361289978027},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.5365424752235413},{"id":"https://openalex.org/keywords/contrast","display_name":"Contrast (vision)","score":0.5307051539421082},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5093918442726135},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3697783648967743},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.26722848415374756},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1282300353050232},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.062230437994003296}],"concepts":[{"id":"https://openalex.org/C2776124973","wikidata":"https://www.wikidata.org/wiki/Q3183033","display_name":"Join (topology)","level":2,"score":0.8163855075836182},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.77101731300354},{"id":"https://openalex.org/C87117476","wikidata":"https://www.wikidata.org/wiki/Q362383","display_name":"Cardinality (data modeling)","level":2,"score":0.7528167963027954},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.6207709312438965},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5971840023994446},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5444194078445435},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.5410361289978027},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.5365424752235413},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.5307051539421082},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5093918442726135},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3697783648967743},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.26722848415374756},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1282300353050232},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.062230437994003296},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2305.19629","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.19629","pdf_url":"https://arxiv.org/pdf/2305.19629","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2305.19629","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2305.19629","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2305.19629","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.19629","pdf_url":"https://arxiv.org/pdf/2305.19629","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1450986124","display_name":null,"funder_award_id":"FJC2020-045809-I","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G1551405195","display_name":null,"funder_award_id":"PID2020-117191RB-I00","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G1650125035","display_name":null,"funder_award_id":"B-I00","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G2262748287","display_name":null,"funder_award_id":"501100011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G2660406951","display_name":null,"funder_award_id":"/ AEI/10","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G3025668325","display_name":null,"funder_award_id":"RB-I00","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G3429648993","display_name":null,"funder_award_id":"PID202","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G3480869486","display_name":null,"funder_award_id":"13039","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G3681454997","display_name":null,"funder_award_id":"13039/501100011033","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G4042783231","display_name":null,"funder_award_id":"501100011033","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"},{"id":"https://openalex.org/G4126322094","display_name":null,"funder_award_id":"01100011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G5400894464","display_name":null,"funder_award_id":"PID2020-117191RB-I00","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G5421974903","display_name":null,"funder_award_id":"501100011033","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G5550858994","display_name":null,"funder_award_id":"AEI/10","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G561707894","display_name":null,"funder_award_id":"FJC2020-045809-I","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G5805502524","display_name":null,"funder_award_id":"PID2020","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G6589649124","display_name":null,"funder_award_id":"PID2020-","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G661330594","display_name":null,"funder_award_id":"00110","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G6685425346","display_name":null,"funder_award_id":"0011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G7084143925","display_name":null,"funder_award_id":"AEI/10","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G7266728691","display_name":null,"funder_award_id":"13039/501100011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"},{"id":"https://openalex.org/G8055432103","display_name":null,"funder_award_id":"13039/501100011033","funder_id":"https://openalex.org/F4320322930","funder_display_name":"Ministerio de Ciencia e Innovaci\u00f3n"},{"id":"https://openalex.org/G8260616629","display_name":null,"funder_award_id":"011033","funder_id":"https://openalex.org/F4320335598","funder_display_name":"Agencia Estatal de Investigaci\u00f3n"}],"funders":[{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"},{"id":"https://openalex.org/F4320322930","display_name":"Ministerio de Ciencia e Innovaci\u00f3n","ror":"https://ror.org/034900433"},{"id":"https://openalex.org/F4320335598","display_name":"Agencia Estatal de Investigaci\u00f3n","ror":null}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4379057097.pdf","grobid_xml":"https://content.openalex.org/works/W4379057097.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4205996836","https://openalex.org/W2151692181","https://openalex.org/W4392498349","https://openalex.org/W2093960938","https://openalex.org/W3214148052","https://openalex.org/W4392216655","https://openalex.org/W2807741550","https://openalex.org/W794462722","https://openalex.org/W2029625042","https://openalex.org/W3101703006"],"abstract_inverted_index":{"We":[0,10,98],"study":[1],"the":[2,12,27,31,55,72,89,112],"problem":[3,13],"of":[4,30,36,57,64,79,118,132,144],"discovering":[5],"joinable":[6],"datasets":[7],"at":[8],"scale.":[9],"approach":[11,101],"from":[14,66],"a":[15,44,58,62,76,85,103],"learning":[16],"perspective":[17],"relying":[18],"on":[19,84],"profiles.":[20],"These":[21],"are":[22,50,137],"succinct":[23],"representations":[24],"that":[25,82,124,131],"capture":[26],"underlying":[28],"characteristics":[29],"schemata":[32],"and":[33,46,91,107,115],"data":[34],"values":[35],"datasets,":[37],"which":[38],"can":[39],"be":[40],"efficiently":[41],"extracted":[42],"in":[43,102],"distributed":[45],"parallel":[47],"fashion.":[48],"Profiles":[49],"then":[51],"compared,":[52],"to":[53,71,110,130,139,141],"predict":[54],"quality":[56,81],"join":[59,80,95],"operation":[60],"among":[61],"pair":[63],"attributes":[65],"different":[67],"datasets.":[68],"In":[69],"contrast":[70],"state-of-the-art,":[73],"we":[74,136],"define":[75],"novel":[77],"notion":[78],"relies":[83],"metric":[86],"considering":[87],"both":[88],"containment":[90],"cardinality":[92],"proportion":[93],"between":[94],"candidate":[96],"attributes.":[97],"implement":[99],"our":[100,119],"system":[104],"called":[105],"NextiaJD,":[106],"present":[108],"experiments":[109,122],"show":[111,123],"predictive":[113,128],"performance":[114,129],"computational":[116],"efficiency":[117],"method.":[120],"Our":[121],"NextiaJD":[125],"obtains":[126],"greater":[127],"hash-based":[133],"methods":[134],"while":[135],"able":[138],"scale-up":[140],"larger":[142],"volumes":[143],"data.":[145]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
