{"id":"https://openalex.org/W4280494175","doi":"https://doi.org/10.1007/s13222-022-00413-2","title":"Data Cleaning and AutoML: Would an Optimizer Choose to Clean?","display_name":"Data Cleaning and AutoML: Would an Optimizer Choose to Clean?","publication_year":2022,"publication_date":"2022-05-13","ids":{"openalex":"https://openalex.org/W4280494175","doi":"https://doi.org/10.1007/s13222-022-00413-2"},"language":"en","primary_location":{"id":"doi:10.1007/s13222-022-00413-2","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s13222-022-00413-2","pdf_url":"https://link.springer.com/content/pdf/10.1007/s13222-022-00413-2.pdf","source":{"id":"https://openalex.org/S73012565","display_name":"Datenbank-Spektrum","issn_l":"1610-1995","issn":["1610-1995","1618-2162"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Datenbank-Spektrum","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s13222-022-00413-2.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5028895029","display_name":"Felix Neutatz","orcid":"https://orcid.org/0000-0001-8698-8010"},"institutions":[{"id":"https://openalex.org/I4577782","display_name":"Technische Universit\u00e4t Berlin","ror":"https://ror.org/03v4gjf40","country_code":"DE","type":"education","lineage":["https://openalex.org/I4577782"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Felix Neutatz","raw_affiliation_strings":["TU Berlin, Berlin, Germany"],"affiliations":[{"raw_affiliation_string":"TU Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I4577782"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028469663","display_name":"Binger Chen","orcid":"https://orcid.org/0009-0007-7153-6100"},"institutions":[{"id":"https://openalex.org/I4577782","display_name":"Technische Universit\u00e4t Berlin","ror":"https://ror.org/03v4gjf40","country_code":"DE","type":"education","lineage":["https://openalex.org/I4577782"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Binger Chen","raw_affiliation_strings":["TU Berlin, Berlin, Germany"],"affiliations":[{"raw_affiliation_string":"TU Berlin, Berlin, Germany","institution_ids":["https://openalex.org/I4577782"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004635370","display_name":"Yazan Alkhatib","orcid":null},"institutions":[{"id":"https://openalex.org/I114112103","display_name":"Leibniz University Hannover","ror":"https://ror.org/0304hq317","country_code":"DE","type":"education","lineage":["https://openalex.org/I114112103"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Yazan Alkhatib","raw_affiliation_strings":["Leibniz Universit\u00e4t Hannover, Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"Leibniz Universit\u00e4t Hannover, Hannover, Germany","institution_ids":["https://openalex.org/I114112103"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057046953","display_name":"Jingwen Ye","orcid":"https://orcid.org/0000-0001-8415-3597"},"institutions":[{"id":"https://openalex.org/I114112103","display_name":"Leibniz University Hannover","ror":"https://ror.org/0304hq317","country_code":"DE","type":"education","lineage":["https://openalex.org/I114112103"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Jingwen Ye","raw_affiliation_strings":["Leibniz Universit\u00e4t Hannover, Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"Leibniz Universit\u00e4t Hannover, Hannover, Germany","institution_ids":["https://openalex.org/I114112103"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009128577","display_name":"Ziawasch Abedjan","orcid":"https://orcid.org/0000-0002-2846-1373"},"institutions":[{"id":"https://openalex.org/I114112103","display_name":"Leibniz University Hannover","ror":"https://ror.org/0304hq317","country_code":"DE","type":"education","lineage":["https://openalex.org/I114112103"]},{"id":"https://openalex.org/I4210136150","display_name":"L3S Research Center","ror":"https://ror.org/039t4wk02","country_code":"DE","type":"facility","lineage":["https://openalex.org/I114112103","https://openalex.org/I4210136150","https://openalex.org/I94509681"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Ziawasch Abedjan","raw_affiliation_strings":["L3S Research Center, Hannover, Germany","Leibniz Universit\u00e4t Hannover, Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"L3S Research Center, Hannover, Germany","institution_ids":["https://openalex.org/I4210136150"]},{"raw_affiliation_string":"Leibniz Universit\u00e4t Hannover, Hannover, Germany","institution_ids":["https://openalex.org/I114112103"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5028895029"],"corresponding_institution_ids":["https://openalex.org/I4577782"],"apc_list":{"value":2380,"currency":"EUR","value_usd":2890},"apc_paid":{"value":2380,"currency":"EUR","value_usd":2890},"fwci":3.6087,"has_fulltext":true,"cited_by_count":27,"citation_normalized_percentile":{"value":0.93651414,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"22","issue":"2","first_page":"121","last_page":"130"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11871","display_name":"Advanced Statistical Methods and Models","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.6921884417533875},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6504330039024353},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5915381908416748},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5441802740097046},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.5077815651893616},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.480704128742218},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.47109872102737427},{"id":"https://openalex.org/keywords/model-selection","display_name":"Model selection","score":0.4553946852684021},{"id":"https://openalex.org/keywords/feature-selection","display_name":"Feature selection","score":0.44602859020233154},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4181622266769409},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4023526906967163},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10564664006233215}],"concepts":[{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.6921884417533875},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6504330039024353},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5915381908416748},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5441802740097046},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.5077815651893616},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.480704128742218},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.47109872102737427},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.4553946852684021},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.44602859020233154},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4181622266769409},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4023526906967163},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10564664006233215},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s13222-022-00413-2","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s13222-022-00413-2","pdf_url":"https://link.springer.com/content/pdf/10.1007/s13222-022-00413-2.pdf","source":{"id":"https://openalex.org/S73012565","display_name":"Datenbank-Spektrum","issn_l":"1610-1995","issn":["1610-1995","1618-2162"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Datenbank-Spektrum","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s13222-022-00413-2","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s13222-022-00413-2","pdf_url":"https://link.springer.com/content/pdf/10.1007/s13222-022-00413-2.pdf","source":{"id":"https://openalex.org/S73012565","display_name":"Datenbank-Spektrum","issn_l":"1610-1995","issn":["1610-1995","1618-2162"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Datenbank-Spektrum","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.46000000834465027}],"awards":[],"funders":[{"id":"https://openalex.org/F4320324094","display_name":"Technische Universit\u00e4t Berlin","ror":"https://ror.org/03v4gjf40"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4280494175.pdf","grobid_xml":"https://content.openalex.org/works/W4280494175.grobid-xml"},"referenced_works_count":14,"referenced_works":["https://openalex.org/W2056132907","https://openalex.org/W2096863518","https://openalex.org/W2115098571","https://openalex.org/W2135874888","https://openalex.org/W2167460663","https://openalex.org/W2170413589","https://openalex.org/W2548122763","https://openalex.org/W2911964244","https://openalex.org/W3082197983","https://openalex.org/W3156669901","https://openalex.org/W4240252876","https://openalex.org/W4243563432","https://openalex.org/W4243565423","https://openalex.org/W6601010567"],"related_works":["https://openalex.org/W3006513224","https://openalex.org/W2046456988","https://openalex.org/W2357409937","https://openalex.org/W2510582230","https://openalex.org/W2978674666","https://openalex.org/W2074430941","https://openalex.org/W2113096305","https://openalex.org/W1977636359","https://openalex.org/W2772305933","https://openalex.org/W2580722822"],"abstract_inverted_index":{"Abstract":[0],"Data":[1],"cleaning":[2,134,184],"is":[3,21,31,45,52,135],"widely":[4],"acknowledged":[5],"as":[6,81,141],"an":[7,142],"important":[8,32,47],"yet":[9],"tedious":[10],"task":[11],"when":[12,37,56],"dealing":[13],"with":[14],"large":[15],"amounts":[16],"of":[17,77,92,101,120,200,207],"data.":[18,67],"Thus,":[19],"there":[20],"always":[22],"a":[23,49,171,188,201],"cost-benefit":[24],"trade-off":[25,36],"to":[26,33,130,164],"consider.":[27],"In":[28,123,157,177],"particular,":[29,158],"it":[30],"assess":[34],"this":[35,124],"not":[38,136],"every":[39],"data":[40,43,93,103,133],"point":[41],"and":[42,114,154],"error":[44,104],"equally":[46],"for":[48],"task.":[50,176],"This":[51],"often":[53],"the":[54,74,78,99,118,128,167,198,205,211],"case":[55],"statistical":[57],"analysis":[58],"or":[59,83,204],"machine":[60],"learning":[61],"(ML)":[62],"models":[63],"derive":[64],"knowledge":[65],"about":[66,72],"If":[68],"we":[69,126,159,180],"only":[70],"care":[71],"maximizing":[73],"utility":[75],"score":[76],"applications,":[79],"such":[80,121],"accuracy":[82],"F1":[84],"scores,":[85],"many":[86,146],"tasks":[87],"can":[88,192],"afford":[89],"some":[90],"degree":[91],"quality":[94],"problems.":[95],"Recent":[96],"studies":[97],"analyzed":[98],"impact":[100,117],"various":[102],"types":[105],"on":[106],"vanilla":[107],"ML":[108,173],"tasks,":[109],"showing":[110],"that":[111,149,169,182],"missing":[112],"values":[113],"outliers":[115],"significantly":[116],"outcome":[119],"models.":[122],"paper,":[125],"expand":[127],"setting":[129],"one":[131],"where":[132],"considered":[137],"in":[138],"isolation":[139],"but":[140,191],"equal":[143],"parameter":[144],"among":[145],"other":[147],"hyper-parameters":[148],"influence":[150],"feature":[151],"selection,":[152],"regularization,":[153],"model":[155,203],"selection.":[156],"use":[160],"state-of-the-art":[161],"AutoML":[162],"frameworks":[163],"automatically":[165],"learn":[166],"parameters":[168],"benefit":[170],"particular":[172],"binary":[174],"classification":[175],"our":[178],"study,":[179],"see":[181],"specific":[183,202,208],"routines":[185],"still":[186],"play":[187],"significant":[189],"role":[190],"also":[193],"be":[194],"entirely":[195],"avoided":[196],"if":[197],"choice":[199],"filtering":[206],"features":[209],"diminishes":[210],"overall":[212],"impact.":[213]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":4}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
