{"id":"https://openalex.org/W4387848581","doi":"https://doi.org/10.1145/3583780.3614786","title":"Automatic and Precise Data Validation for Machine Learning","display_name":"Automatic and Precise Data Validation for Machine Learning","publication_year":2023,"publication_date":"2023-10-21","ids":{"openalex":"https://openalex.org/W4387848581","doi":"https://doi.org/10.1145/3583780.3614786"},"language":"en","primary_location":{"id":"doi:10.1145/3583780.3614786","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3583780.3614786","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3583780.3614786","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3583780.3614786","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023956968","display_name":"Shreya Shankar","orcid":"https://orcid.org/0000-0002-0919-9672"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shreya Shankar","raw_affiliation_strings":["University of California, Berkeley, Berkeley, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Berkeley, Berkeley, CA, USA","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055260819","display_name":"Labib Fawaz","orcid":"https://orcid.org/0009-0002-6543-112X"},"institutions":[{"id":"https://openalex.org/I4210099336","display_name":"Menlo School","ror":"https://ror.org/01240pn49","country_code":"US","type":"education","lineage":["https://openalex.org/I4210099336"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Labib Fawaz","raw_affiliation_strings":["Meta, Menlo Park, CA, USA"],"affiliations":[{"raw_affiliation_string":"Meta, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I4210099336"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010337815","display_name":"Karl Gyllstrom","orcid":"https://orcid.org/0009-0003-6594-3552"},"institutions":[{"id":"https://openalex.org/I4210099336","display_name":"Menlo School","ror":"https://ror.org/01240pn49","country_code":"US","type":"education","lineage":["https://openalex.org/I4210099336"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Karl Gyllstrom","raw_affiliation_strings":["Meta, Menlo Park, CA, USA"],"affiliations":[{"raw_affiliation_string":"Meta, Menlo Park, CA, USA","institution_ids":["https://openalex.org/I4210099336"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5013608601","display_name":"Aditya Parameswaran","orcid":"https://orcid.org/0000-0002-4538-4752"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aditya Parameswaran","raw_affiliation_strings":["University of California, Berkeley, Berkeley, CA, USA"],"affiliations":[{"raw_affiliation_string":"University of California, Berkeley, Berkeley, CA, USA","institution_ids":["https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5023956968"],"corresponding_institution_ids":["https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":2.4924,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":{"value":0.89791578,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2198","last_page":"2207"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8062653541564941},{"id":"https://openalex.org/keywords/data-validation","display_name":"Data validation","score":0.5782907009124756},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5635477900505066},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.561455488204956},{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.545515775680542},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.49277499318122864},{"id":"https://openalex.org/keywords/timestamp","display_name":"Timestamp","score":0.4680846333503723},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46467381715774536},{"id":"https://openalex.org/keywords/partition","display_name":"Partition (number theory)","score":0.43699344992637634},{"id":"https://openalex.org/keywords/labeled-data","display_name":"Labeled data","score":0.428332656621933},{"id":"https://openalex.org/keywords/real-time-computing","display_name":"Real-time computing","score":0.1449814736843109},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.11240014433860779}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8062653541564941},{"id":"https://openalex.org/C92446256","wikidata":"https://www.wikidata.org/wiki/Q3306762","display_name":"Data validation","level":2,"score":0.5782907009124756},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5635477900505066},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.561455488204956},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.545515775680542},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.49277499318122864},{"id":"https://openalex.org/C113954288","wikidata":"https://www.wikidata.org/wiki/Q186885","display_name":"Timestamp","level":2,"score":0.4680846333503723},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46467381715774536},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.43699344992637634},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.428332656621933},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.1449814736843109},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.11240014433860779},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3583780.3614786","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3583780.3614786","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3583780.3614786","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3583780.3614786","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3583780.3614786","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3583780.3614786","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Information and Knowledge Management","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.5199999809265137,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"awards":[{"id":"https://openalex.org/G101112439","display_name":null,"funder_award_id":"Fellow","funder_id":"https://openalex.org/F4320306151","funder_display_name":"Alfred P. Sloan Foundation"},{"id":"https://openalex.org/G1990441842","display_name":"CAREER: Advancing Open-Ended Crowdsourcing: The Next Frontier in Crowdsourced Data Management","funder_award_id":"1940757","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3027998694","display_name":null,"funder_award_id":"IIS-2129008","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3432830374","display_name":"AitF: Collaborative Research: Fast, Accurate, and Practical: Adaptive Sublinear Algorithms for Scalable Visualization","funder_award_id":"1940759","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4257867673","display_name":null,"funder_award_id":"IIS-1940759","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6204381899","display_name":"FW-HTF-R: Human-Machine Teaming for Effective Data Work at Scale: Upskilling Defense Lawyers Working with Police and Court Process Data","funder_award_id":"2129008","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6621079209","display_name":null,"funder_award_id":"NDSEG","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6894402473","display_name":null,"funder_award_id":"Fellowship","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8667236997","display_name":null,"funder_award_id":"2243822","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8926491534","display_name":null,"funder_award_id":"Fellowship","funder_id":"https://openalex.org/F4320333566","funder_display_name":"National Defense Science and Engineering Graduate"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306151","display_name":"Alfred P. Sloan Foundation","ror":"https://ror.org/052csg198"},{"id":"https://openalex.org/F4320333566","display_name":"National Defense Science and Engineering Graduate","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387848581.pdf","grobid_xml":"https://content.openalex.org/works/W4387848581.grobid-xml"},"referenced_works_count":36,"referenced_works":["https://openalex.org/W1530232915","https://openalex.org/W1992479406","https://openalex.org/W2048430744","https://openalex.org/W2063103859","https://openalex.org/W2086234878","https://openalex.org/W2095897464","https://openalex.org/W2122646361","https://openalex.org/W2129281431","https://openalex.org/W2153233077","https://openalex.org/W2210827128","https://openalex.org/W2328111639","https://openalex.org/W2437617937","https://openalex.org/W2475334473","https://openalex.org/W2544486974","https://openalex.org/W2548122763","https://openalex.org/W2743948853","https://openalex.org/W2889249015","https://openalex.org/W2891345706","https://openalex.org/W2893303656","https://openalex.org/W2905588001","https://openalex.org/W2949762319","https://openalex.org/W2968778390","https://openalex.org/W2999905431","https://openalex.org/W3010653466","https://openalex.org/W3015276915","https://openalex.org/W3030932193","https://openalex.org/W3103264664","https://openalex.org/W3104534434","https://openalex.org/W3144571334","https://openalex.org/W3155567600","https://openalex.org/W3160786003","https://openalex.org/W3166319166","https://openalex.org/W4251670979","https://openalex.org/W4255375128","https://openalex.org/W4296563717","https://openalex.org/W4312645701"],"related_works":["https://openalex.org/W2366403280","https://openalex.org/W1495108544","https://openalex.org/W2091301346","https://openalex.org/W3148229873","https://openalex.org/W4389760904","https://openalex.org/W2150160875","https://openalex.org/W4242223894","https://openalex.org/W4306886878","https://openalex.org/W1517524280","https://openalex.org/W2060561905"],"abstract_inverted_index":{"Machine":[0],"learning":[1],"(ML)":[2],"models":[3],"in":[4,24,135,165],"production":[5],"pipelines":[6],"are":[7,54,69,118],"frequently":[8],"retrained":[9],"on":[10,173],"the":[11,168],"latest":[12],"partitions":[13,23],"of":[14,108],"large,":[15],"continually-":[16],"growing":[17],"datasets.":[18],"Due":[19],"to":[20,36,56,102,120,127],"engineering":[21],"bugs,":[22],"such":[25],"datasets":[26],"almost":[27],"always":[28],"have":[29],"some":[30],"corrupted":[31,122],"features;":[32],"thus,":[33],"it's":[34],"critical":[35],"find":[37],"data":[38,51,84,103,109,113,132,153],"issues":[39],"and":[40,82,116,139],"block":[41],"retraining":[42],"before":[43],"downstream":[44],"ML":[45,50,88],"accuracy":[46],"decreases.":[47],"However,":[48],"current":[49],"validation":[52,85,133,154],"methods":[53],"difficult":[55],"operationalize:":[57],"they":[58],"yield":[59],"too":[60],"many":[61],"false":[62],"positive":[63],"alerts,":[64],"require":[65],"manual":[66],"tuning,":[67],"or":[68],"infeasible":[70],"at":[71],"scale.":[72],"In":[73],"this":[74],"pa-":[75],"per,":[76],"we":[77,95,149],"present":[78,150],"an":[79],"automatic,":[80],"precise,":[81],"scalable":[83],"system":[86],"for":[87,130],"pipelines,":[89],"employing":[90],"a":[91,97,136,161,174],"simple":[92],"idea":[93],"that":[94,156],"call":[96],"Partition":[98],"Summarization":[99],"(PS)":[100],"approach":[101],"validation:":[104],"each":[105],"timestamp-based":[106],"partition":[107],"is":[110],"summarized":[111],"with":[112],"quality":[114],"metrics,":[115],"summaries":[117],"compared":[119],"detect":[121],"partitions.":[123],"We":[124],"demonstrate":[125],"how":[126],"adapt":[128],"PS":[129],"any":[131],"method":[134,155],"robust":[137],"manner":[138],"evaluate":[140],"several":[141],"adaptations-which":[142],"by":[143],"themselves":[144],"provide":[145],"limited":[146],"precision.":[147],"Finally,":[148],"gate,":[151],"our":[152,178],"leverages":[157],"these":[158],"adaptations,":[159],"giving":[160],"2.1\u00d7":[162],"average":[163],"improvement":[164],"precision":[166],"over":[167],"baseline":[169],"from":[170],"prior":[171],"work":[172],"case":[175],"study":[176],"within":[177],"large":[179],"tech":[180],"company.":[181]},"counts_by_year":[{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":3}],"updated_date":"2026-04-11T08:14:18.477133","created_date":"2025-10-10T00:00:00"}
