{"id":"https://openalex.org/W4413360639","doi":"https://doi.org/10.1109/icde65448.2025.00118","title":"TabSketchFM: Sketch-Based Tabular Representation Learning for Data Discovery Over Data Lakes","display_name":"TabSketchFM: Sketch-Based Tabular Representation Learning for Data Discovery Over Data Lakes","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W4413360639","doi":"https://doi.org/10.1109/icde65448.2025.00118"},"language":"en","primary_location":{"id":"doi:10.1109/icde65448.2025.00118","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icde65448.2025.00118","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 41st International Conference on Data Engineering (ICDE)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047161025","display_name":"Aamod Khatiwada","orcid":"https://orcid.org/0000-0001-5720-1207"},"institutions":[{"id":"https://openalex.org/I87182695","display_name":"Universidad del Noreste","ror":"https://ror.org/02ahky613","country_code":"MX","type":"education","lineage":["https://openalex.org/I87182695"]}],"countries":["MX"],"is_corresponding":false,"raw_author_name":"Aamod Khatiwada","raw_affiliation_strings":["Northeastern University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Northeastern University","institution_ids":["https://openalex.org/I87182695"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025268861","display_name":"Harsha Kokel","orcid":"https://orcid.org/0000-0002-7548-3719"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Harsha Kokel","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031546123","display_name":"Ibrahim Abdelaziz","orcid":"https://orcid.org/0000-0003-1449-5115"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ibrahim Abdelaziz","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000750466","display_name":"Subhajit Chaudhury","orcid":"https://orcid.org/0000-0003-3435-2584"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Subhajit Chaudhury","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012855077","display_name":"Julian Dolby","orcid":"https://orcid.org/0000-0002-6658-2678"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Julian Dolby","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068065546","display_name":"Oktie Hassanzadeh","orcid":"https://orcid.org/0000-0001-5307-9857"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oktie Hassanzadeh","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050620631","display_name":"Zhenhan Huang","orcid":"https://orcid.org/0000-0002-9129-781X"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhenhan Huang","raw_affiliation_strings":["Rensselaer Polytechnic Institute"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066128057","display_name":"Tejaswini Pedapati","orcid":"https://orcid.org/0000-0002-5260-0951"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tejaswini Pedapati","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035277014","display_name":"Horst Samulowitz","orcid":"https://orcid.org/0000-0002-6780-3217"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Horst Samulowitz","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085594669","display_name":"Kavitha Srinivas","orcid":"https://orcid.org/0000-0003-4610-967X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kavitha Srinivas","raw_affiliation_strings":["IBM Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Research","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.064,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.78987521,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1523","last_page":"1536"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12205","display_name":"Time Series Analysis and Forecasting","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9898999929428101,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sketch","display_name":"Sketch","score":0.9250528812408447},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7419081926345825},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.6389681696891785},{"id":"https://openalex.org/keywords/external-data-representation","display_name":"External Data Representation","score":0.4805788993835449},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.42969226837158203},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.38023990392684937},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3434709906578064},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3253013789653778},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.07081994414329529}],"concepts":[{"id":"https://openalex.org/C2779231336","wikidata":"https://www.wikidata.org/wiki/Q7534724","display_name":"Sketch","level":2,"score":0.9250528812408447},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7419081926345825},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.6389681696891785},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.4805788993835449},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42969226837158203},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.38023990392684937},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3434709906578064},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3253013789653778},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.07081994414329529},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icde65448.2025.00118","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icde65448.2025.00118","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 41st International Conference on Data Engineering (ICDE)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/13","display_name":"Climate action","score":0.47999998927116394}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W1969621019","https://openalex.org/W2008896880","https://openalex.org/W2072883612","https://openalex.org/W2080133951","https://openalex.org/W2108223890","https://openalex.org/W2341748398","https://openalex.org/W2542998387","https://openalex.org/W2616147950","https://openalex.org/W2752618741","https://openalex.org/W2795089200","https://openalex.org/W2798649495","https://openalex.org/W2798664493","https://openalex.org/W2890431379","https://openalex.org/W2926805670","https://openalex.org/W2941366772","https://openalex.org/W2948163032","https://openalex.org/W2951621897","https://openalex.org/W2963469388","https://openalex.org/W2970992672","https://openalex.org/W3008881932","https://openalex.org/W3014616325","https://openalex.org/W3035140194","https://openalex.org/W3035231859","https://openalex.org/W3082424964","https://openalex.org/W3139909695","https://openalex.org/W3145728363","https://openalex.org/W3153032435","https://openalex.org/W3158303960","https://openalex.org/W3165753548","https://openalex.org/W3174181645","https://openalex.org/W3174637548","https://openalex.org/W4205922070","https://openalex.org/W4225683910","https://openalex.org/W4283383705","https://openalex.org/W4300601563","https://openalex.org/W4310390625","https://openalex.org/W4365456672","https://openalex.org/W4366327856","https://openalex.org/W4380433117","https://openalex.org/W4385653220","https://openalex.org/W4385893866","https://openalex.org/W4399208169","https://openalex.org/W4399208200","https://openalex.org/W4400532462","https://openalex.org/W4403577861"],"related_works":["https://openalex.org/W2378994405","https://openalex.org/W2917844847","https://openalex.org/W2036757537","https://openalex.org/W2759085743","https://openalex.org/W4282930045","https://openalex.org/W4238546310","https://openalex.org/W4241634354","https://openalex.org/W2021866862","https://openalex.org/W2376367779","https://openalex.org/W2553449828"],"abstract_inverted_index":{"Enterprises":[0],"have":[1],"a":[2,40,55,95,120,127],"growing":[3],"need":[4],"to":[5,58,99,114,151],"identify":[6],"relevant":[7],"tables":[8,13,125],"in":[9,65,126,145],"data":[10,31,45,48,63,175],"lakes;":[11],"e.g.":[12],"that":[14,129,134,164],"are":[15,103,130,135],"unionable,":[16,77,131],"joinable,":[17,78,132],"or":[18,133],"subsets":[19,136],"of":[20,62,137],"each":[21],"other.":[22],"Tabular":[23],"neural":[24,41,66,90],"models":[25,113],"can":[26,167],"be":[27],"help-ful":[28],"for":[29,44,75,105,148],"such":[30],"discovery":[32,46,64],"tasks.":[33,107],"In":[34],"this":[35],"paper,":[36],"we":[37,51,70,93,109,155],"present":[38,94],"TabSketchFM,":[39],"tabular":[42,67,89],"model":[43,74,166],"over":[47,87,173],"lakes.":[49,176],"First,":[50],"propose":[52],"novel":[53],"pre-training:":[54],"sketch-based":[56],"approach":[57],"enhance":[59],"the":[60,72,138],"effectiveness":[61],"models.":[68,91],"Second,":[69],"finetune":[71],"pretrained":[73],"identifying":[76],"and":[79,83,161,172],"subset":[80],"table":[81,116],"pairs":[82],"show":[84,156],"significant":[85,143,157],"improvement":[86],"previous":[88],"Third,":[92],"detailed":[96],"ablation":[97],"study":[98],"highlight":[100],"which":[101,106],"sketches":[102],"crucial":[104],"Fourth,":[108],"use":[110],"these":[111],"finetuned":[112],"perform":[115],"search;":[117],"i.e.,":[118],"given":[119],"query":[121],"table,":[122],"find":[123],"other":[124],"corpus":[128],"query.":[139],"Our":[140],"results":[141],"demonstrate":[142],"improvements":[144],"F1":[146],"scores":[147],"search":[149],"compared":[150],"state-of-the-art":[152],"techniques.":[153],"Finally,":[154],"transfer":[158],"across":[159,169],"datasets":[160],"tasks":[162,171],"establishing":[163],"our":[165],"generalize":[168],"different":[170,174]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
