{"id":"https://openalex.org/W2889249015","doi":"https://doi.org/10.14778/3229863.3229867","title":"Automating large-scale data quality verification","display_name":"Automating large-scale data quality verification","publication_year":2018,"publication_date":"2018-08-01","ids":{"openalex":"https://openalex.org/W2889249015","doi":"https://doi.org/10.14778/3229863.3229867","mag":"2889249015"},"language":"en","primary_location":{"id":"doi:10.14778/3229863.3229867","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3229863.3229867","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090934117","display_name":"Sebastian Schelter","orcid":"https://orcid.org/0000-0003-4722-5840"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sebastian Schelter","raw_affiliation_strings":["Amazon Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070190076","display_name":"Dustin Lange","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dustin Lange","raw_affiliation_strings":["Amazon Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109246682","display_name":"Philipp Schmidt","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Philipp Schmidt","raw_affiliation_strings":["Amazon Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048334005","display_name":"Meltem Celikel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meltem Celikel","raw_affiliation_strings":["Amazon Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon Research","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024953386","display_name":"Felix Bie\u00dfmann","orcid":"https://orcid.org/0000-0002-3422-1026"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Felix Biessmann","raw_affiliation_strings":["Amazon Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Amazon Research","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065371205","display_name":"Andreas Grafberger","orcid":null},"institutions":[{"id":"https://openalex.org/I179225836","display_name":"University of Augsburg","ror":"https://ror.org/03p14d497","country_code":"DE","type":"education","lineage":["https://openalex.org/I179225836"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Andreas Grafberger","raw_affiliation_strings":["University of Augsburg and Amazon Research"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Augsburg and Amazon Research","institution_ids":["https://openalex.org/I179225836"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":14.7515,"has_fulltext":false,"cited_by_count":215,"citation_normalized_percentile":{"value":0.99093025,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":"11","issue":"12","first_page":"1781","last_page":"1794"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11891","display_name":"Big Data and Business Intelligence","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8261442184448242},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.591938853263855},{"id":"https://openalex.org/keywords/data-validation","display_name":"Data validation","score":0.5816976428031921},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5086332559585571},{"id":"https://openalex.org/keywords/predictability","display_name":"Predictability","score":0.5046185255050659},{"id":"https://openalex.org/keywords/spark","display_name":"SPARK (programming language)","score":0.49687817692756653},{"id":"https://openalex.org/keywords/data-integrity","display_name":"Data integrity","score":0.4927728474140167},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4770773649215698},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.47645214200019836},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.4569348692893982},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.45059734582901},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.11832728981971741},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09605869650840759}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8261442184448242},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.591938853263855},{"id":"https://openalex.org/C92446256","wikidata":"https://www.wikidata.org/wiki/Q3306762","display_name":"Data validation","level":2,"score":0.5816976428031921},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5086332559585571},{"id":"https://openalex.org/C197640229","wikidata":"https://www.wikidata.org/wiki/Q2534066","display_name":"Predictability","level":2,"score":0.5046185255050659},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.49687817692756653},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.4927728474140167},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4770773649215698},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.47645214200019836},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.4569348692893982},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.45059734582901},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.11832728981971741},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09605869650840759},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.14778/3229863.3229867","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3229863.3229867","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},{"id":"pmh:oai:uni-augsburg.opus-bayern.de:45846","is_oa":false,"landing_page_url":"https://opus.bibliothek.uni-augsburg.de/opus4/frontdoor/index/index/docId/45846","pdf_url":null,"source":{"id":"https://openalex.org/S4306400930","display_name":"OPUS (Augsburg University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I119916105","host_organization_name":"Augsburg University","host_organization_lineage":["https://openalex.org/I119916105"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W144900376","https://openalex.org/W760598031","https://openalex.org/W1485156179","https://openalex.org/W1982092405","https://openalex.org/W1984566373","https://openalex.org/W2004291985","https://openalex.org/W2038412523","https://openalex.org/W2063103859","https://openalex.org/W2101234009","https://openalex.org/W2103018059","https://openalex.org/W2112452856","https://openalex.org/W2119803607","https://openalex.org/W2132862423","https://openalex.org/W2140336868","https://openalex.org/W2153531471","https://openalex.org/W2170712852","https://openalex.org/W2189162242","https://openalex.org/W2189465200","https://openalex.org/W2238711864","https://openalex.org/W2357449897","https://openalex.org/W2432911982","https://openalex.org/W2437617937","https://openalex.org/W2437990191","https://openalex.org/W2438792749","https://openalex.org/W2439326083","https://openalex.org/W2547386789","https://openalex.org/W2584580687","https://openalex.org/W2591700809","https://openalex.org/W2607045400","https://openalex.org/W2611090425","https://openalex.org/W2611130659","https://openalex.org/W2613597870","https://openalex.org/W2614986686","https://openalex.org/W2743948853","https://openalex.org/W2752857821","https://openalex.org/W2753069234","https://openalex.org/W2767280887","https://openalex.org/W2771375649","https://openalex.org/W2782864149","https://openalex.org/W2792538726","https://openalex.org/W2792572948","https://openalex.org/W2811507150","https://openalex.org/W2962843773","https://openalex.org/W2963288913","https://openalex.org/W2997591727","https://openalex.org/W3004286518","https://openalex.org/W4242142158","https://openalex.org/W4253407506","https://openalex.org/W4285719527","https://openalex.org/W4293582904","https://openalex.org/W6687322159"],"related_works":["https://openalex.org/W3041894995","https://openalex.org/W2146933441","https://openalex.org/W2083062572","https://openalex.org/W3175087045","https://openalex.org/W4310519870","https://openalex.org/W2976128099","https://openalex.org/W1485984927","https://openalex.org/W4205770829","https://openalex.org/W4212820174","https://openalex.org/W2334967743"],"abstract_inverted_index":{"Modern":[0],"companies":[1],"and":[2,13,81,117,133,153],"institutions":[3],"rely":[4],"on":[5,102,114,158],"data":[6,35,54,112,139],"to":[7,38,99],"guide":[8],"every":[9],"single":[10],"business":[11],"process":[12,23],"decision.":[14],"Missing":[15],"or":[16],"incorrect":[17],"information":[18],"seriously":[19],"compromises":[20],"any":[21],"decision":[22],"downstream.":[24],"Therefore,":[25],"a":[26,47,69,131],"crucial,":[27],"but":[28],"tedious":[29],"task":[30],"for":[31,49,86,122,126,134],"everyone":[32],"involved":[33],"in":[34,137],"processing":[36],"is":[37],"verify":[39],"the":[40,51,60,91,108,128,149],"quality":[41,55,75,113,140],"of":[42,53,62,111,130],"their":[43],"data.":[44,87],"We":[45,88,143],"present":[46,154],"system":[48,67,151],"automating":[50],"verification":[52],"at":[56],"scale,":[57],"which":[58,72],"meets":[59],"requirements":[61],"production":[63],"use":[64],"cases.":[65],"Our":[66,105],"provides":[68],"declarative":[70],"API,":[71],"combines":[73],"common":[74],"constraints":[76],"with":[77],"user-defined":[78],"validation":[79,94,110],"code,":[80],"thereby":[82],"enables":[83],"'unit":[84],"tests'":[85],"efficiently":[89],"execute":[90],"resulting":[92,150],"constraint":[93,124],"workload":[95],"by":[96],"translating":[97],"it":[98],"aggregation":[100],"queries":[101],"Apache":[103],"Spark.":[104],"platform":[106],"supports":[107],"incremental":[109],"growing":[115],"datasets,":[116],"leverages":[118],"machine":[119],"learning,":[120],"e.g.,":[121],"enhancing":[123],"suggestions,":[125],"estimating":[127],"'predictability'":[129],"column,":[132],"detecting":[135],"anomalies":[136],"historic":[138],"time":[141],"series.":[142],"discuss":[144],"our":[145],"design":[146],"decisions,":[147],"describe":[148],"architecture,":[152],"an":[155],"experimental":[156],"evaluation":[157],"various":[159],"datasets.":[160]},"counts_by_year":[{"year":2026,"cited_by_count":17},{"year":2025,"cited_by_count":33},{"year":2024,"cited_by_count":41},{"year":2023,"cited_by_count":28},{"year":2022,"cited_by_count":29},{"year":2021,"cited_by_count":29},{"year":2020,"cited_by_count":23},{"year":2019,"cited_by_count":13},{"year":2016,"cited_by_count":1},{"year":2012,"cited_by_count":1}],"updated_date":"2026-06-12T08:23:45.883708","created_date":"2025-10-10T00:00:00"}
