{"id":"https://openalex.org/W4386123414","doi":"https://doi.org/10.14778/3611479.3611533","title":"Cross Modal Data Discovery over Structured and Unstructured Data Lakes","display_name":"Cross Modal Data Discovery over Structured and Unstructured Data Lakes","publication_year":2023,"publication_date":"2023-07-01","ids":{"openalex":"https://openalex.org/W4386123414","doi":"https://doi.org/10.14778/3611479.3611533"},"language":"en","primary_location":{"id":"doi:10.14778/3611479.3611533","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3611479.3611533","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074322354","display_name":"Mohamed Y. Eltabakh","orcid":"https://orcid.org/0000-0002-6344-8246"},"institutions":[{"id":"https://openalex.org/I4210138380","display_name":"Qatar Cardiovascular Research Center","ror":"https://ror.org/038vyt185","country_code":"QA","type":"healthcare","lineage":["https://openalex.org/I4210138380"]}],"countries":["QA"],"is_corresponding":true,"raw_author_name":"Mohamed Y. Eltabakh","raw_affiliation_strings":["Qatar Computing Research Institute (QCRI), Doha, Qatar"],"affiliations":[{"raw_affiliation_string":"Qatar Computing Research Institute (QCRI), Doha, Qatar","institution_ids":["https://openalex.org/I4210138380"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058892091","display_name":"Mayuresh Kunjir","orcid":"https://orcid.org/0009-0006-6196-2184"},"institutions":[{"id":"https://openalex.org/I4210089985","display_name":"Amazon (Germany)","ror":"https://ror.org/00b9ktm87","country_code":"DE","type":"company","lineage":["https://openalex.org/I1311688040","https://openalex.org/I4210089985"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Mayuresh Kunjir","raw_affiliation_strings":["Amazon Web Services, Berlin, Germany"],"affiliations":[{"raw_affiliation_string":"Amazon Web Services, Berlin, Germany","institution_ids":["https://openalex.org/I4210089985"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089912733","display_name":"Ahmed K. Elmagarmid","orcid":"https://orcid.org/0000-0002-0044-458X"},"institutions":[{"id":"https://openalex.org/I4210138380","display_name":"Qatar Cardiovascular Research Center","ror":"https://ror.org/038vyt185","country_code":"QA","type":"healthcare","lineage":["https://openalex.org/I4210138380"]}],"countries":["QA"],"is_corresponding":false,"raw_author_name":"Ahmed K. Elmagarmid","raw_affiliation_strings":["Qatar Computing Research Institute (QCRI), Doha, Qatar"],"affiliations":[{"raw_affiliation_string":"Qatar Computing Research Institute (QCRI), Doha, Qatar","institution_ids":["https://openalex.org/I4210138380"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009743322","display_name":"Mohammad Shahmeer Ahmad","orcid":null},"institutions":[{"id":"https://openalex.org/I4210138380","display_name":"Qatar Cardiovascular Research Center","ror":"https://ror.org/038vyt185","country_code":"QA","type":"healthcare","lineage":["https://openalex.org/I4210138380"]}],"countries":["QA"],"is_corresponding":false,"raw_author_name":"Mohammad Shahmeer Ahmad","raw_affiliation_strings":["Qatar Computing Research Institute (QCRI), Doha, Qatar"],"affiliations":[{"raw_affiliation_string":"Qatar Computing Research Institute (QCRI), Doha, Qatar","institution_ids":["https://openalex.org/I4210138380"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5074322354"],"corresponding_institution_ids":["https://openalex.org/I4210138380"],"apc_list":null,"apc_paid":null,"fwci":1.7831,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.85801187,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"16","issue":"11","first_page":"3377","last_page":"3390"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8025916814804077},{"id":"https://openalex.org/keywords/unstructured-data","display_name":"Unstructured data","score":0.6623945832252502},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6210261583328247},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5949089527130127},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.5258196592330933},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5016427040100098},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.48765480518341064},{"id":"https://openalex.org/keywords/semi-structured-data","display_name":"Semi-structured data","score":0.47663912177085876},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.46053504943847656},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.43895289301872253},{"id":"https://openalex.org/keywords/knowledge-extraction","display_name":"Knowledge extraction","score":0.42936980724334717},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.41161012649536133},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.41070711612701416},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.28740811347961426},{"id":"https://openalex.org/keywords/relational-database","display_name":"Relational database","score":0.22690153121948242},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.20037272572517395},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.1684975028038025}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8025916814804077},{"id":"https://openalex.org/C2781252014","wikidata":"https://www.wikidata.org/wiki/Q1141900","display_name":"Unstructured data","level":3,"score":0.6623945832252502},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6210261583328247},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5949089527130127},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5258196592330933},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5016427040100098},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.48765480518341064},{"id":"https://openalex.org/C40077939","wikidata":"https://www.wikidata.org/wiki/Q2336004","display_name":"Semi-structured data","level":3,"score":0.47663912177085876},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.46053504943847656},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.43895289301872253},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.42936980724334717},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.41161012649536133},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.41070711612701416},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.28740811347961426},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.22690153121948242},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.20037272572517395},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.1684975028038025},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3611479.3611533","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3611479.3611533","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1980867644","https://openalex.org/W2020022499","https://openalex.org/W2083619093","https://openalex.org/W2096733369","https://openalex.org/W2323103108","https://openalex.org/W2394595640","https://openalex.org/W2427822648","https://openalex.org/W2493916176","https://openalex.org/W2616147950","https://openalex.org/W2795089200","https://openalex.org/W2798664493","https://openalex.org/W2915623326","https://openalex.org/W2924309908","https://openalex.org/W2948163032","https://openalex.org/W2950817225","https://openalex.org/W2951621897","https://openalex.org/W2953502323","https://openalex.org/W2963174348","https://openalex.org/W2963265099","https://openalex.org/W2963502184","https://openalex.org/W2970992672","https://openalex.org/W3008672977","https://openalex.org/W3013008430","https://openalex.org/W3014616325","https://openalex.org/W3031580546","https://openalex.org/W3082424964","https://openalex.org/W3088189770","https://openalex.org/W3106224367","https://openalex.org/W3109684201","https://openalex.org/W3138890951","https://openalex.org/W3174637548","https://openalex.org/W3180107114","https://openalex.org/W4205922070","https://openalex.org/W4252076394"],"related_works":["https://openalex.org/W2019158987","https://openalex.org/W2142354878","https://openalex.org/W2034595671","https://openalex.org/W2281126075","https://openalex.org/W2942479669","https://openalex.org/W2405464607","https://openalex.org/W3034384113","https://openalex.org/W2044775339","https://openalex.org/W1622528090","https://openalex.org/W4327649155"],"abstract_inverted_index":{"Organizations":[0],"are":[1,14,45,277],"collecting":[2],"increasingly":[3],"large":[4],"amounts":[5],"of":[6,26,28,39,71,165],"data":[7,13,23,58,99,138,151,159,235,267],"for":[8,90,115,143,249],"data-driven":[9],"decision":[10],"making.":[11],"These":[12],"often":[15],"dumped":[16],"into":[17],"a":[18,22,48,91,112,136,168,192],"centralized":[19],"repository,":[20],"e.g.,":[21],"lake,":[24],"consisting":[25],"thousands":[27],"structured":[29,78,156,282],"and":[30,75,79,109,120,157,204,219,226,240,263,269],"unstructured":[31,80,158],"datasets.":[32,283],"Perversely,":[33],"such":[34],"mixture":[35],"makes":[36],"the":[37,54,61,68,103,128,150,162,172,183,187,199,221,254,273,281],"problem":[38,62,93],"discovering":[40,73],"tables":[41],"or":[42,98,101],"documents":[43,203],"that":[44,177,197,242,276],"relevant":[46],"to":[47,126,175,180,253,265,272,279],"user's":[49],"query":[50],"very":[51],"challenging.":[52],"Despite":[53],"recent":[55],"efforts":[56],"in":[57,67,122],"discovery":[59,129,139,152,184,251],",":[60,218],"remains":[63],"widely":[64],"open":[65],"especially":[66],"two":[69,146,188],"fronts":[70],"(1)":[72],"relationships":[74],"relatedness":[76],"across":[77,186],"datasets-where":[81],"existing":[82],"techniques":[83],"suffer":[84],"from":[85],"either":[86],"scalability,":[87],"being":[88],"customized":[89],"specific":[92],"type":[94],"(e.g.,":[95],"entity":[96],"matching":[97],"integration),":[100],"demolishing":[102],"structural":[104,163],"properties":[105,164],"on":[106,211,232],"its":[107],"way,":[108],"(2)":[110],"developing":[111],"holistic":[113],"system":[114,174,222,244],"integrating":[116],"various":[117],"similarity":[118],"measurements":[119],"sketches":[121],"an":[123],"effective":[124,248],"way":[125],"boost":[127],"accuracy.":[130],"In":[131],"this":[132],"paper,":[133],"we":[134],"propose":[135,191],"new":[137],"system,":[140],"named":[141],"CMDL,":[142],"addressing":[144],"these":[145],"limitations.":[147],"CMDL":[148,170,231,259],"supports":[149],"process":[153],"over":[154],"both":[155],"while":[160],"retaining":[161],"tables.":[166],"As":[167],"result,":[169],"is":[171,223,245,260],"only":[173,280],"date":[176],"empowers":[178],"end-users":[179],"seamlessly":[181],"pipeline":[182],"tasks":[185],"modalities.":[189],"We":[190,229],"novel":[193],"multi-modal":[194],"embedding":[195],"representation":[196],"captures":[198],"similarities":[200],"between":[201],"text":[202],"tabular":[205],"columns.":[206],"The":[207],"model":[208],"training":[209],"relies":[210],"labeled":[212],"datasets":[213],"generated":[214],"though":[215],"weak":[216],"supervision":[217],"thus":[220],"domain":[224],"agnostic":[225],"easily":[227],"generalizable.":[228],"evaluate":[230],"three":[233],"real-world":[234],"lakes":[236],"with":[237],"diverse":[238],"applications":[239],"show":[241],"our":[243],"significantly":[246],"more":[247,261],"cross-modality":[250],"compared":[252,271],"search-based":[255],"baseline":[256],"techniques.":[257],"Moreover,":[258],"accurate":[262],"robust":[264],"different":[266],"types":[268],"distributions":[270],"state-of-the-art":[274],"systems":[275],"limited":[278]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
