{"id":"https://openalex.org/W4206953594","doi":"https://doi.org/10.1109/ssci50451.2021.9659897","title":"Near duplicate column identification: a machine learning approach","display_name":"Near duplicate column identification: a machine learning approach","publication_year":2021,"publication_date":"2021-12-05","ids":{"openalex":"https://openalex.org/W4206953594","doi":"https://doi.org/10.1109/ssci50451.2021.9659897"},"language":"en","primary_location":{"id":"doi:10.1109/ssci50451.2021.9659897","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ssci50451.2021.9659897","pdf_url":null,"source":{"id":"https://openalex.org/S4363604921","display_name":"2021 IEEE Symposium Series on Computational Intelligence (SSCI)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Symposium Series on Computational Intelligence (SSCI)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038326539","display_name":"Marc Chevallier","orcid":"https://orcid.org/0000-0002-7983-6147"},"institutions":[{"id":"https://openalex.org/I4210156583","display_name":"Laboratoire d'Informatique de Paris-Nord","ror":"https://ror.org/05g1zjw44","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I4210091279","https://openalex.org/I4210156583","https://openalex.org/I4210159245"]},{"id":"https://openalex.org/I4210091279","display_name":"Universit\u00e9 Sorbonne Paris Nord","ror":"https://ror.org/0199hds37","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210091279"]}],"countries":["FR"],"is_corresponding":true,"raw_author_name":"Marc Chevallier","raw_affiliation_strings":["LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)"],"affiliations":[{"raw_affiliation_string":"LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","institution_ids":["https://openalex.org/I4210156583","https://openalex.org/I4210091279"]},{"raw_affiliation_string":"LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)","institution_ids":["https://openalex.org/I4210091279","https://openalex.org/I4210156583"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023463946","display_name":"Faouzi Boufar\u00e8s","orcid":null},"institutions":[{"id":"https://openalex.org/I4210156583","display_name":"Laboratoire d'Informatique de Paris-Nord","ror":"https://ror.org/05g1zjw44","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I4210091279","https://openalex.org/I4210156583","https://openalex.org/I4210159245"]},{"id":"https://openalex.org/I4210091279","display_name":"Universit\u00e9 Sorbonne Paris Nord","ror":"https://ror.org/0199hds37","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210091279"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Faouzi Boufares","raw_affiliation_strings":["LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)"],"affiliations":[{"raw_affiliation_string":"LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","institution_ids":["https://openalex.org/I4210156583","https://openalex.org/I4210091279"]},{"raw_affiliation_string":"LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)","institution_ids":["https://openalex.org/I4210091279","https://openalex.org/I4210156583"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007160877","display_name":"Nistor Grozavu","orcid":"https://orcid.org/0000-0001-7502-8022"},"institutions":[{"id":"https://openalex.org/I4210156583","display_name":"Laboratoire d'Informatique de Paris-Nord","ror":"https://ror.org/05g1zjw44","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I4210091279","https://openalex.org/I4210156583","https://openalex.org/I4210159245"]},{"id":"https://openalex.org/I4210091279","display_name":"Universit\u00e9 Sorbonne Paris Nord","ror":"https://ror.org/0199hds37","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210091279"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Nistor Grozavu","raw_affiliation_strings":["LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)"],"affiliations":[{"raw_affiliation_string":"LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","institution_ids":["https://openalex.org/I4210156583","https://openalex.org/I4210091279"]},{"raw_affiliation_string":"LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)","institution_ids":["https://openalex.org/I4210091279","https://openalex.org/I4210156583"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046439373","display_name":"Nicoleta Rogovschi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210156583","display_name":"Laboratoire d'Informatique de Paris-Nord","ror":"https://ror.org/05g1zjw44","country_code":"FR","type":"facility","lineage":["https://openalex.org/I1294671590","https://openalex.org/I1294671590","https://openalex.org/I4210091279","https://openalex.org/I4210156583","https://openalex.org/I4210159245"]},{"id":"https://openalex.org/I4210091279","display_name":"Universit\u00e9 Sorbonne Paris Nord","ror":"https://ror.org/0199hds37","country_code":"FR","type":"education","lineage":["https://openalex.org/I4210091279"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Nicoleta Rogovschi","raw_affiliation_strings":["LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)"],"affiliations":[{"raw_affiliation_string":"LIPN Laboratory Sorbonne Paris Nord University, Villetaneuse, France","institution_ids":["https://openalex.org/I4210156583","https://openalex.org/I4210091279"]},{"raw_affiliation_string":"LIPN - Laboratoire d'Informatique de Paris-Nord (Institut Galil\u00e9e, Universit\u00e9 Paris 13, 99 avenue Jean-Baptiste Cl\u00e9ment, F-93430, Villetaneuse - France)","institution_ids":["https://openalex.org/I4210091279","https://openalex.org/I4210156583"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028202000","display_name":"Charly Clairmont","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Charly Clairmont","raw_affiliation_strings":["Synaltic, Vincennes, France","SYNALTIC - Synaltic (24 Rue de l'\u00c9glise, 94300 Vincennes - France)"],"affiliations":[{"raw_affiliation_string":"Synaltic, Vincennes, France","institution_ids":[]},{"raw_affiliation_string":"SYNALTIC - Synaltic (24 Rue de l'\u00c9glise, 94300 Vincennes - France)","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5038326539"],"corresponding_institution_ids":["https://openalex.org/I4210091279","https://openalex.org/I4210156583"],"apc_list":null,"apc_paid":null,"fwci":1.2665,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.79187817,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"9","issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11871","display_name":"Advanced Statistical Methods and Models","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11871","display_name":"Advanced Statistical Methods and Models","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9872999787330627,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11443","display_name":"Advanced Statistical Process Monitoring","score":0.984000027179718,"subfield":{"id":"https://openalex.org/subfields/1804","display_name":"Statistics, Probability and Uncertainty"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/column","display_name":"Column (typography)","score":0.8518080711364746},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7492492198944092},{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.6675742268562317},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6024661660194397},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5875421762466431},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.548139214515686},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.5379112362861633},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.5315788984298706},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5192782878875732},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4272170066833496},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.41534921526908875},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4115443229675293},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.37151044607162476},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1178928017616272}],"concepts":[{"id":"https://openalex.org/C2780551164","wikidata":"https://www.wikidata.org/wiki/Q2306599","display_name":"Column (typography)","level":3,"score":0.8518080711364746},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7492492198944092},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.6675742268562317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6024661660194397},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5875421762466431},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.548139214515686},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.5379112362861633},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.5315788984298706},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5192782878875732},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4272170066833496},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.41534921526908875},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4115443229675293},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.37151044607162476},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1178928017616272},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/ssci50451.2021.9659897","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ssci50451.2021.9659897","pdf_url":null,"source":{"id":"https://openalex.org/S4363604921","display_name":"2021 IEEE Symposium Series on Computational Intelligence (SSCI)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Symposium Series on Computational Intelligence (SSCI)","raw_type":"proceedings-article"},{"id":"pmh:oai:HAL:hal-03548697v1","is_oa":false,"landing_page_url":"https://hal.science/hal-03548697","pdf_url":null,"source":{"id":"https://openalex.org/S4306402512","display_name":"HAL (Le Centre pour la Communication Scientifique Directe)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1294671590","host_organization_name":"Centre National de la Recherche Scientifique","host_organization_lineage":["https://openalex.org/I1294671590"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"2021 IEEE Symposium Series on Computational Intelligence (SSCI), Dec 2021, Orlando, United States. pp.1-7, &#x27E8;10.1109/SSCI50451.2021.9659897&#x27E9;","raw_type":"Conference papers"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W164606317","https://openalex.org/W1601435884","https://openalex.org/W2123402141","https://openalex.org/W2145073242","https://openalex.org/W2157444450","https://openalex.org/W2768348081","https://openalex.org/W2911964244","https://openalex.org/W2951621897","https://openalex.org/W2964022491","https://openalex.org/W3082424964","https://openalex.org/W3155575368","https://openalex.org/W3158963140","https://openalex.org/W4229641819","https://openalex.org/W4240301789","https://openalex.org/W6606702875","https://openalex.org/W6636065709","https://openalex.org/W6681651645","https://openalex.org/W6683161245","https://openalex.org/W6745609711"],"related_works":["https://openalex.org/W2181722423","https://openalex.org/W2347222412","https://openalex.org/W2085601491","https://openalex.org/W2375996887","https://openalex.org/W2786391746","https://openalex.org/W4381430104","https://openalex.org/W2995102745","https://openalex.org/W4226059458","https://openalex.org/W2914559142","https://openalex.org/W1990237101"],"abstract_inverted_index":{"Data":[0,13],"quality":[1,14],"is":[2,15,23],"a":[3,16,71,84,89],"global":[4],"issue":[5],"in":[6,33],"our":[7,21,45],"society":[8],"that":[9,58,62,86],"every":[10],"company":[11],"encounter.":[12],"vast":[17],"field":[18,35],"of":[19,54,100,108,130,147,154],"study,":[20,124],"work":[22,46],"focused":[24],"on":[25,47,105,111],"relational":[26],"Data.":[27],"A":[28],"lot":[29],"have":[30],"been":[31],"done":[32],"this":[34,95,123,139],"to":[36,66,73,97,114,132],"identify":[37],"duplicate":[38,56],"lines,":[39],"but":[40],"here":[41],"we":[42,125],"will":[43],"focus":[44],"columns.":[48],"We":[49,69,81,142],"define":[50],"the":[51,106,134,145,152,155],"new":[52],"concept":[53],"near":[55,79,120],"columns,":[57],"characterises":[59],"two":[60,76,117],"columns":[61,77,118],"are":[63,78,119],"really":[64],"similar":[65],"each":[67],"other.":[68],"introduce":[70],"method":[72,85,96,103],"determine":[74,115],"if":[75,116],"duplicate.":[80,121],"first":[82],"describe":[83],"works":[87],"for":[88,138],"specific":[90],"column":[91],"and":[92],"then":[93],"generalize":[94],"any":[98],"couple":[99],"column.":[101],"This":[102],"relies":[104],"use":[107],"classifiers":[109,131],"trained":[110],"artificial":[112,156],"data-set":[113],"In":[122],"try":[126],"multiple":[127],"possible":[128],"choices":[129],"find":[133],"most":[135],"appropriate":[136],"one":[137],"learning":[140],"problem.":[141],"also":[143],"explore":[144],"effect":[146],"modifying":[148],"experimental":[149],"parameters":[150],"during":[151],"generation":[153],"training":[157],"data.":[158]},"counts_by_year":[{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
