{"id":"https://openalex.org/W4400373550","doi":"https://doi.org/10.48550/arxiv.2407.01619","title":"TabSketchFM: Sketch-based Tabular Representation Learning for Data Discovery over Data Lakes","display_name":"TabSketchFM: Sketch-based Tabular Representation Learning for Data Discovery over Data Lakes","publication_year":2024,"publication_date":"2024-06-28","ids":{"openalex":"https://openalex.org/W4400373550","doi":"https://doi.org/10.48550/arxiv.2407.01619"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.01619","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.01619","pdf_url":"https://arxiv.org/pdf/2407.01619","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.01619","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047161025","display_name":"Aamod Khatiwada","orcid":"https://orcid.org/0000-0001-5720-1207"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Khatiwada, Aamod","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025268861","display_name":"Harsha Kokel","orcid":"https://orcid.org/0000-0002-7548-3719"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kokel, Harsha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031546123","display_name":"Ibrahim Abdelaziz","orcid":"https://orcid.org/0000-0003-1449-5115"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdelaziz, Ibrahim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000750466","display_name":"Subhajit Chaudhury","orcid":"https://orcid.org/0000-0003-3435-2584"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chaudhury, Subhajit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012855077","display_name":"Julian Dolby","orcid":"https://orcid.org/0000-0002-6658-2678"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dolby, Julian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068065546","display_name":"Oktie Hassanzadeh","orcid":"https://orcid.org/0000-0001-5307-9857"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hassanzadeh, Oktie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050620631","display_name":"Zhenhan Huang","orcid":"https://orcid.org/0000-0002-9129-781X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Zhenhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066128057","display_name":"Tejaswini Pedapati","orcid":"https://orcid.org/0000-0002-5260-0951"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pedapati, Tejaswini","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035277014","display_name":"Horst Samulowitz","orcid":"https://orcid.org/0000-0002-6780-3217"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Samulowitz, Horst","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5085594669","display_name":"Kavitha Srinivas","orcid":"https://orcid.org/0000-0003-4610-967X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Srinivas, Kavitha","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5047161025"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sketch","display_name":"Sketch","score":0.88982093334198},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6186431646347046},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5986055135726929},{"id":"https://openalex.org/keywords/external-data-representation","display_name":"External Data Representation","score":0.4505055248737335},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38887614011764526},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.35970938205718994},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3374457359313965},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.32355645298957825},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.06771540641784668}],"concepts":[{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.88982093334198},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6186431646347046},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5986055135726929},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.4505055248737335},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38887614011764526},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.35970938205718994},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3374457359313965},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.32355645298957825},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.06771540641784668},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.01619","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.01619","pdf_url":"https://arxiv.org/pdf/2407.01619","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2407.01619","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.01619","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.01619","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.01619","pdf_url":"https://arxiv.org/pdf/2407.01619","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4400373550.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2917844847","https://openalex.org/W2036757537","https://openalex.org/W2759085743","https://openalex.org/W2096451653","https://openalex.org/W4282930045","https://openalex.org/W4238546310","https://openalex.org/W4241634354","https://openalex.org/W2021866862","https://openalex.org/W2376367779","https://openalex.org/W2553449828"],"abstract_inverted_index":{"Enterprises":[0],"have":[1],"a":[2,40,55,95,120,127],"growing":[3],"need":[4],"to":[5,58,99,114,151],"identify":[6],"relevant":[7],"tables":[8,13,125],"in":[9,65,126,145],"data":[10,31,45,48,63,175],"lakes;":[11],"e.g.":[12],"that":[14,129,134,164],"are":[15,103,130,135],"unionable,":[16,77,131],"joinable,":[17,78,132],"or":[18,133],"subsets":[19,136],"of":[20,62,137],"each":[21],"other.":[22],"Tabular":[23],"neural":[24,41,66,90],"models":[25,113],"can":[26,167],"be":[27],"helpful":[28],"for":[29,44,75,105,148],"such":[30],"discovery":[32,46,64],"tasks.":[33,107],"In":[34],"this":[35],"paper,":[36],"we":[37,51,70,93,109,155],"present":[38,94],"TabSketchFM,":[39],"tabular":[42,67,89],"model":[43,74,166],"over":[47,87,173],"lakes.":[49,176],"First,":[50],"propose":[52],"novel":[53],"pre-training:":[54],"sketch-based":[56],"approach":[57],"enhance":[59],"the":[60,72,138],"effectiveness":[61],"models.":[68,91],"Second,":[69],"finetune":[71],"pretrained":[73],"identifying":[76],"and":[79,83,161,172],"subset":[80],"table":[81,116],"pairs":[82],"show":[84,156],"significant":[85,143,157],"improvement":[86],"previous":[88],"Third,":[92],"detailed":[96],"ablation":[97],"study":[98],"highlight":[100],"which":[101,106],"sketches":[102],"crucial":[104],"Fourth,":[108],"use":[110],"these":[111],"finetuned":[112],"perform":[115],"search;":[117],"i.e.,":[118],"given":[119],"query":[121],"table,":[122],"find":[123],"other":[124],"corpus":[128],"query.":[139],"Our":[140],"results":[141],"demonstrate":[142],"improvements":[144],"F1":[146],"scores":[147],"search":[149],"compared":[150],"state-of-the-art":[152],"techniques.":[153],"Finally,":[154],"transfer":[158],"across":[159,169],"datasets":[160],"tasks":[162,171],"establishing":[163],"our":[165],"generalize":[168],"different":[170,174]},"counts_by_year":[],"updated_date":"2026-03-13T16:22:10.518609","created_date":"2025-10-10T00:00:00"}
