{"id":"https://openalex.org/W4416146274","doi":"https://doi.org/10.1145/3767712","title":"Clustering with Set Outliers and Applications in Relational Clustering","display_name":"Clustering with Set Outliers and Applications in Relational Clustering","publication_year":2025,"publication_date":"2025-11-10","ids":{"openalex":"https://openalex.org/W4416146274","doi":"https://doi.org/10.1145/3767712"},"language":"en","primary_location":{"id":"doi:10.1145/3767712","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3767712","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072972143","display_name":"Vaishali Surianarayanan","orcid":"https://orcid.org/0000-0003-3091-3823"},"institutions":[{"id":"https://openalex.org/I185103710","display_name":"University of California, Santa Cruz","ror":"https://ror.org/03s65by71","country_code":"US","type":"education","lineage":["https://openalex.org/I185103710"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vaishali Surianarayanan","raw_affiliation_strings":["University of California, Santa Cruz, Santa Cruz, USA"],"raw_orcid":"https://orcid.org/0000-0003-3091-3823","affiliations":[{"raw_affiliation_string":"University of California, Santa Cruz, Santa Cruz, USA","institution_ids":["https://openalex.org/I185103710"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020253720","display_name":"Neeraj Kumar","orcid":"https://orcid.org/0000-0001-9356-526X"},"institutions":[{"id":"https://openalex.org/I4210114444","display_name":"Meta (United States)","ror":"https://ror.org/01zbnvs85","country_code":"US","type":"company","lineage":["https://openalex.org/I4210114444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Neeraj Kumar","raw_affiliation_strings":["Meta Platforms, Menlo Park, USA"],"raw_orcid":"https://orcid.org/0000-0001-9356-526X","affiliations":[{"raw_affiliation_string":"Meta Platforms, Menlo Park, USA","institution_ids":["https://openalex.org/I4210114444"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5015883931","display_name":"Stavros Sintos","orcid":"https://orcid.org/0000-0002-2114-8886"},"institutions":[{"id":"https://openalex.org/I39422238","display_name":"University of Illinois Chicago","ror":"https://ror.org/02mpq6x41","country_code":"US","type":"education","lineage":["https://openalex.org/I39422238"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stavros Sintos","raw_affiliation_strings":["University of Illinois Chicago, Chicago, USA"],"raw_orcid":"https://orcid.org/0000-0002-2114-8886","affiliations":[{"raw_affiliation_string":"University of Illinois Chicago, Chicago, USA","institution_ids":["https://openalex.org/I39422238"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7588,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.89501137,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"3","issue":"5","first_page":"1","last_page":"27"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.20350000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.20350000262260437,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12536","display_name":"Topological and Geometric Data Analysis","score":0.17499999701976776,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10637","display_name":"Advanced Clustering Algorithms Research","score":0.13269999623298645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.7343999743461609},{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.5522000193595886},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.4797999858856201},{"id":"https://openalex.org/keywords/cure-data-clustering-algorithm","display_name":"CURE data clustering algorithm","score":0.43220001459121704},{"id":"https://openalex.org/keywords/data-point","display_name":"Data point","score":0.4156000018119812},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.41130000352859497},{"id":"https://openalex.org/keywords/data-stream-clustering","display_name":"Data stream clustering","score":0.39910000562667847},{"id":"https://openalex.org/keywords/correlation-clustering","display_name":"Correlation clustering","score":0.3959999978542328},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.3741999864578247}],"concepts":[{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.7343999743461609},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.5522000193595886},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.4797999858856201},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4634000062942505},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.4560999870300293},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4366999864578247},{"id":"https://openalex.org/C33704608","wikidata":"https://www.wikidata.org/wiki/Q5014717","display_name":"CURE data clustering algorithm","level":4,"score":0.43220001459121704},{"id":"https://openalex.org/C21080849","wikidata":"https://www.wikidata.org/wiki/Q13611879","display_name":"Data point","level":2,"score":0.4156000018119812},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.41130000352859497},{"id":"https://openalex.org/C193143536","wikidata":"https://www.wikidata.org/wiki/Q5227360","display_name":"Data stream clustering","level":5,"score":0.39910000562667847},{"id":"https://openalex.org/C94641424","wikidata":"https://www.wikidata.org/wiki/Q5172845","display_name":"Correlation clustering","level":3,"score":0.3959999978542328},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.3741999864578247},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.374099999666214},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.37400001287460327},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.36480000615119934},{"id":"https://openalex.org/C27964816","wikidata":"https://www.wikidata.org/wiki/Q5164359","display_name":"Constrained clustering","level":5,"score":0.3562000095844269},{"id":"https://openalex.org/C104047586","wikidata":"https://www.wikidata.org/wiki/Q5033439","display_name":"Canopy clustering algorithm","level":4,"score":0.34290000796318054},{"id":"https://openalex.org/C198043062","wikidata":"https://www.wikidata.org/wiki/Q180953","display_name":"Metric space","level":2,"score":0.3269999921321869},{"id":"https://openalex.org/C148764684","wikidata":"https://www.wikidata.org/wiki/Q621751","display_name":"Approximation algorithm","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C5655090","wikidata":"https://www.wikidata.org/wiki/Q192588","display_name":"Relational database","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2847000062465668},{"id":"https://openalex.org/C22648726","wikidata":"https://www.wikidata.org/wiki/Q7523744","display_name":"Single-linkage clustering","level":5,"score":0.2825999855995178},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C184509293","wikidata":"https://www.wikidata.org/wiki/Q5136711","display_name":"Clustering high-dimensional data","level":3,"score":0.27090001106262207},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C17212007","wikidata":"https://www.wikidata.org/wiki/Q5511111","display_name":"Fuzzy clustering","level":3,"score":0.25429999828338623},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2538999915122986}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3767712","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3767712","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W166438205","https://openalex.org/W1523004878","https://openalex.org/W1992479406","https://openalex.org/W1994772769","https://openalex.org/W2004012902","https://openalex.org/W2008196645","https://openalex.org/W2023770210","https://openalex.org/W2032775418","https://openalex.org/W2060978628","https://openalex.org/W2091684877","https://openalex.org/W2104498537","https://openalex.org/W2120358419","https://openalex.org/W2122585291","https://openalex.org/W2133311553","https://openalex.org/W2137118456","https://openalex.org/W2148417962","https://openalex.org/W2150865801","https://openalex.org/W2169998163","https://openalex.org/W2220451813","https://openalex.org/W2247380138","https://openalex.org/W2284514301","https://openalex.org/W2604519798","https://openalex.org/W2767218854","https://openalex.org/W2950050514","https://openalex.org/W2962991166","https://openalex.org/W2963388857","https://openalex.org/W2997488476","https://openalex.org/W2997810140","https://openalex.org/W3030153809","https://openalex.org/W3144298286","https://openalex.org/W3169940233","https://openalex.org/W3177418610","https://openalex.org/W4250042253","https://openalex.org/W4285136459","https://openalex.org/W4313351227","https://openalex.org/W4313476522","https://openalex.org/W4378713432","https://openalex.org/W4384705345","https://openalex.org/W4386796904","https://openalex.org/W4388669487","https://openalex.org/W4394654141","https://openalex.org/W4404126146","https://openalex.org/W4404126154","https://openalex.org/W4404130423","https://openalex.org/W4409150735"],"related_works":[],"abstract_inverted_index":{"We":[0,150,235,267,278,307],"introduce":[1],"and":[2,14,62,76,93,147,162,175,206,280,294,320],"study":[3,281],"the":[4,18,80,152,183,225,242,256,304],"k":[5,20,88],"-center":[6,21],"clustering":[7,22,198,257,275,319],"problem":[8,158,244],"with":[9,23,276],"set":[10,59,86],"outliers,":[11],"a":[12,40,51,58,64,67,85,94,189,238,292,314],"natural":[13],"practical":[15],"generalization":[16],"of":[17,26,43,60,69,87,96,186,228],"classical":[19],"outliers.":[24,277],"Instead":[25],"removing":[27],"individual":[28],"data":[29,139,145],"points,":[30],"our":[31,229],"model":[32,271],"allows":[33],"discarding":[34],"up":[35,171],"to":[36,83,109,172,209,250],"z":[37,97,177,264],"subsets":[38],"from":[39],"given":[41,50],"family":[42,68,95],"candidate":[44],"outlier":[45,178,265],"sets":[46,70,98,179,187],"H.":[47],"More":[48],"formally,":[49],"metric":[52],"space":[53],"(P,dist),":[54],"where":[55,286,296],"P":[56,74,92],"is":[57,82,182,248],"elements":[61],"dist":[63,123],"distance":[65],"metric,":[66],"H":[71,99,101,107,117],"\u2286":[72,91,100],"2":[73],",":[75],"parameters":[77],"k,":[78],"z,":[79],"goal":[81],"compute":[84],"centers":[89,174],"C":[90,122],"such":[102,136],"that":[103,188,246,269],"C\u2229(\u22c3":[104],"h":[105,115],"\u2208":[106,113,116,121],"h)=\u2205":[108],"minimize":[110],"max":[111],"p":[112],"P\u2216(\u22c3":[114],"h)":[118],"min":[119],"c":[120],"(p,c)":[124],"(clustering":[125],"cost).":[126],"This":[127],"abstraction":[128],"captures":[129,273],"structured":[130],"noise":[131],"common":[132],"in":[133,144,159,197,291],"database":[134],"applications,":[135],"as":[137],"faulty":[138],"sources":[140],"or":[141],"corrupted":[142],"records":[143],"integration":[146],"sensor":[148],"systems.":[149],"present":[151],"first":[153],"approximation":[154,196,254,309],"algorithms":[155,230,310],"for":[156,241,311],"this":[157,220,270],"both":[160],"general":[161,243],"geometric":[163,201],"settings.":[164],"Our":[165],"methods":[166],"provide":[167,237,308],"tri-criteria":[168],"approximations:":[169],"selecting":[170,259],"2k":[173],"2f":[176],"(where":[180],"f":[181,262],"maximum":[184],"number":[185],"point":[190],"belongs":[191],"to),":[192],"while":[193],"achieving":[194],"constant-factor":[195],"cost.":[199],"In":[200,214,219],"settings,":[202],"we":[203,222],"leverage":[204],"range":[205],"BBD":[207],"trees":[208],"achieve":[210],"near-linear":[211],"time":[212,227],"algorithms.":[213],"many":[215],"real":[216],"applications":[217],"f=1.":[218],"case":[221],"further":[223],"improve":[224],"running":[226],"by":[231],"constructing":[232],"small":[233],"coresets.":[234],"also":[236],"hardness":[239],"result":[240,289],"showing":[245],"it":[247],"unlikely":[249],"get":[251],"any":[252],"sublinear":[253],"on":[255],"cost":[258],"less":[260],"than":[261],"\u2022":[263],"sets.":[266],"demonstrate":[268],"naturally":[272],"relational":[274,321],"define":[279],"two":[282],"new":[283],"formulations:":[284],"one":[285],"outliers":[287,297],"are":[288,298],"tuples":[290,300],"join,":[293],"another":[295],"input":[299],"whose":[301],"removal":[302],"affects":[303],"join":[305],"output.":[306],"both,":[312],"establishing":[313],"tight":[315],"connection":[316],"between":[317],"robust":[318],"query":[322],"evaluation.":[323]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-12T00:00:00"}
