{"id":"https://openalex.org/W4381328625","doi":"https://doi.org/10.1145/3589328","title":"DiffPrep: Differentiable Data Preprocessing Pipeline Search for Learning over Tabular Data","display_name":"DiffPrep: Differentiable Data Preprocessing Pipeline Search for Learning over Tabular Data","publication_year":2023,"publication_date":"2023-06-13","ids":{"openalex":"https://openalex.org/W4381328625","doi":"https://doi.org/10.1145/3589328"},"language":"en","primary_location":{"id":"doi:10.1145/3589328","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3589328","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3589328","source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3589328","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006439934","display_name":"Peng Li","orcid":"https://orcid.org/0009-0007-7150-7633"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Peng Li","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100767024","display_name":"Zhiyi Chen","orcid":"https://orcid.org/0000-0002-5420-9956"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhiyi Chen","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100430982","display_name":"Xu Chu","orcid":"https://orcid.org/0009-0007-3202-3767"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xu Chu","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001368248","display_name":"Kexin Rong","orcid":"https://orcid.org/0000-0002-3282-5360"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kexin Rong","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5006439934"],"corresponding_institution_ids":["https://openalex.org/I130701444"],"apc_list":null,"apc_paid":null,"fwci":3.2807,"has_fulltext":true,"cited_by_count":19,"citation_normalized_percentile":{"value":0.93520235,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"1","issue":"2","first_page":"1","last_page":"26"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7434805631637573},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.742848813533783},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.6683477759361267},{"id":"https://openalex.org/keywords/data-pre-processing","display_name":"Data pre-processing","score":0.5780923366546631},{"id":"https://openalex.org/keywords/usable","display_name":"USable","score":0.5518711805343628},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5039870142936707},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.49074041843414307},{"id":"https://openalex.org/keywords/differentiable-function","display_name":"Differentiable function","score":0.48943892121315},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4760453701019287},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.44855743646621704},{"id":"https://openalex.org/keywords/domain-knowledge","display_name":"Domain knowledge","score":0.41406935453414917},{"id":"https://openalex.org/keywords/test-data","display_name":"Test data","score":0.41225308179855347},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3270626664161682},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09586817026138306}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7434805631637573},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.742848813533783},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.6683477759361267},{"id":"https://openalex.org/C10551718","wikidata":"https://www.wikidata.org/wiki/Q5227332","display_name":"Data pre-processing","level":2,"score":0.5780923366546631},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.5518711805343628},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5039870142936707},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49074041843414307},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.48943892121315},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4760453701019287},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44855743646621704},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.41406935453414917},{"id":"https://openalex.org/C16910744","wikidata":"https://www.wikidata.org/wiki/Q7705759","display_name":"Test data","level":2,"score":0.41225308179855347},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3270626664161682},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09586817026138306},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3589328","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3589328","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3589328","source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2308.10915","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.10915","pdf_url":"https://arxiv.org/pdf/2308.10915","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3589328","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3589328","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3589328","source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309321","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4381328625.pdf","grobid_xml":"https://content.openalex.org/works/W4381328625.grobid-xml"},"referenced_works_count":42,"referenced_works":["https://openalex.org/W575847903","https://openalex.org/W1665115054","https://openalex.org/W1876967670","https://openalex.org/W1990283121","https://openalex.org/W2003447360","https://openalex.org/W2088252378","https://openalex.org/W2101234009","https://openalex.org/W2102539288","https://openalex.org/W2111547563","https://openalex.org/W2132862423","https://openalex.org/W2137479650","https://openalex.org/W2182361439","https://openalex.org/W2344786740","https://openalex.org/W2437617937","https://openalex.org/W2544486974","https://openalex.org/W2548122763","https://openalex.org/W2575143348","https://openalex.org/W2606014079","https://openalex.org/W2767280887","https://openalex.org/W2899771611","https://openalex.org/W2914436728","https://openalex.org/W2941345940","https://openalex.org/W2947123069","https://openalex.org/W2948742859","https://openalex.org/W2951104886","https://openalex.org/W2953384591","https://openalex.org/W2966284335","https://openalex.org/W2971681342","https://openalex.org/W2995480811","https://openalex.org/W2997591727","https://openalex.org/W3021459486","https://openalex.org/W3103177583","https://openalex.org/W3157467026","https://openalex.org/W4214680115","https://openalex.org/W4226468895","https://openalex.org/W4280494175","https://openalex.org/W4281845057","https://openalex.org/W4288365786","https://openalex.org/W4295292688","https://openalex.org/W4297803580","https://openalex.org/W4302308043","https://openalex.org/W4380928255"],"related_works":["https://openalex.org/W2982321410","https://openalex.org/W2392004567","https://openalex.org/W2046296964","https://openalex.org/W4285277090","https://openalex.org/W2940029036","https://openalex.org/W4388292429","https://openalex.org/W2756595502","https://openalex.org/W4327738859","https://openalex.org/W2010789764","https://openalex.org/W4393273072"],"abstract_inverted_index":{"Data":[0],"preprocessing":[1,59,99,127],"is":[2,119],"a":[3,16,53,88,97,102,107,131,150],"crucial":[4],"step":[5],"in":[6],"the":[7,33,63,77,113,116,123,144,160,168,179,187,194],"machine":[8,40],"learning":[9,41],"process":[10],"that":[11,90,112,176],"transforms":[12],"raw":[13],"data":[14,47,58,98,126],"into":[15,149],"more":[17],"usable":[18],"format":[19],"for":[20,96,101],"downstream":[21],"ML":[22,78,109,117,169],"models.":[23],"However,":[24,49],"it":[25],"can":[26,91],"be":[27],"costly":[28],"and":[29,67,93,106,142,152,192],"time-consuming,":[30],"often":[31,51,70],"requiring":[32],"expertise":[34],"of":[35,57,115,125,186],"domain":[36],"experts.":[37],"Existing":[38],"automated":[39],"(AutoML)":[42],"frameworks":[43],"claim":[44],"to":[45,158,200],"automate":[46],"preprocessing.":[48],"they":[50,68,74],"use":[52],"restricted":[54],"search":[55,95,129,147,162],"space":[56,148],"pipelines":[60],"which":[61,155],"limits":[62],"potential":[64],"performance":[65,114],"gains,":[66],"are":[69],"too":[71],"slow":[72],"as":[73,130],"require":[75],"training":[76,167],"model":[79,110,118,170],"multiple":[80],"times.":[81],"In":[82],"this":[83,137],"paper,":[84],"we":[85,140],"propose":[86],"DiffPrep,":[87],"method":[89],"automatically":[92],"efficiently":[94],"pipeline":[100,128,161],"given":[103],"tabular":[104],"dataset":[105],"differentiable":[108,153],"such":[111],"maximized.":[120],"We":[121],"formalize":[122],"problem":[124,138],"bi-level":[132],"optimization":[133],"problem.":[134],"To":[135],"solve":[136],"efficiently,":[139],"transform":[141],"relax":[143],"discrete,":[145],"non-differential":[146],"continuous":[151],"one,":[154],"allows":[156],"us":[157],"perform":[159],"using":[163],"gradient":[164],"descent":[165],"with":[166],"only":[171],"once.":[172],"Our":[173],"experiments":[174],"show":[175],"DiffPrep":[177],"achieves":[178],"best":[180],"test":[181,196],"accuracy":[182,197],"on":[183],"15":[184],"out":[185],"18":[188],"real-world":[189],"datasets":[190],"evaluated":[191],"improves":[193],"model's":[195],"by":[198],"up":[199],"6.6":[201],"percentage":[202],"points.":[203]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":2}],"updated_date":"2026-04-11T08:14:18.477133","created_date":"2023-06-21T00:00:00"}
