{"id":"https://openalex.org/W3142848995","doi":"https://doi.org/10.1007/s00180-022-01207-6","title":"Regularized target encoding outperforms traditional methods in supervised machine learning with high cardinality features","display_name":"Regularized target encoding outperforms traditional methods in supervised machine learning with high cardinality features","publication_year":2022,"publication_date":"2022-03-04","ids":{"openalex":"https://openalex.org/W3142848995","doi":"https://doi.org/10.1007/s00180-022-01207-6","mag":"3142848995"},"language":"en","primary_location":{"id":"doi:10.1007/s00180-022-01207-6","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00180-022-01207-6","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00180-022-01207-6.pdf","source":{"id":"https://openalex.org/S8500805","display_name":"Computational Statistics","issn_l":"0943-4062","issn":["0943-4062","1613-9658"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Statistics","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://link.springer.com/content/pdf/10.1007/s00180-022-01207-6.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039217538","display_name":"Florian Pargent","orcid":"https://orcid.org/0000-0002-2388-553X"},"institutions":[{"id":"https://openalex.org/I8204097","display_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","ror":"https://ror.org/05591te55","country_code":"DE","type":"education","lineage":["https://openalex.org/I8204097"]},{"id":"https://openalex.org/I897549280","display_name":"Hologic (Germany)","ror":"https://ror.org/007zn4n28","country_code":"DE","type":"company","lineage":["https://openalex.org/I4210117952","https://openalex.org/I897549280"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Florian Pargent","raw_affiliation_strings":["Department of Psychology, Psychological Methods and Assessment, LMU Munich, Leopoldstra\u00dfe 13, 80802, Munich, Germany"],"raw_orcid":"https://orcid.org/0000-0002-2388-553X","affiliations":[{"raw_affiliation_string":"Department of Psychology, Psychological Methods and Assessment, LMU Munich, Leopoldstra\u00dfe 13, 80802, Munich, Germany","institution_ids":["https://openalex.org/I897549280","https://openalex.org/I8204097"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019209786","display_name":"Florian Pfisterer","orcid":"https://orcid.org/0000-0001-8867-762X"},"institutions":[{"id":"https://openalex.org/I8204097","display_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","ror":"https://ror.org/05591te55","country_code":"DE","type":"education","lineage":["https://openalex.org/I8204097"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Florian Pfisterer","raw_affiliation_strings":["Department of Statistics, Statistical Learning and Data Science, LMU Munich, Ludwigstra\u00dfe 33, 80539, Munich, Germany"],"raw_orcid":"https://orcid.org/0000-0001-8867-762X","affiliations":[{"raw_affiliation_string":"Department of Statistics, Statistical Learning and Data Science, LMU Munich, Ludwigstra\u00dfe 33, 80539, Munich, Germany","institution_ids":["https://openalex.org/I8204097"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032315622","display_name":"Janek Thomas","orcid":"https://orcid.org/0000-0003-4511-6245"},"institutions":[{"id":"https://openalex.org/I8204097","display_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","ror":"https://ror.org/05591te55","country_code":"DE","type":"education","lineage":["https://openalex.org/I8204097"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Janek Thomas","raw_affiliation_strings":["Department of Statistics, Statistical Learning and Data Science, LMU Munich, Ludwigstra\u00dfe 33, 80539, Munich, Germany"],"raw_orcid":"https://orcid.org/0000-0003-4511-6245","affiliations":[{"raw_affiliation_string":"Department of Statistics, Statistical Learning and Data Science, LMU Munich, Ludwigstra\u00dfe 33, 80539, Munich, Germany","institution_ids":["https://openalex.org/I8204097"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069072700","display_name":"Bernd Bischl","orcid":"https://orcid.org/0000-0001-6002-6980"},"institutions":[{"id":"https://openalex.org/I8204097","display_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","ror":"https://ror.org/05591te55","country_code":"DE","type":"education","lineage":["https://openalex.org/I8204097"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bernd Bischl","raw_affiliation_strings":["Department of Statistics, Statistical Learning and Data Science, LMU Munich, Ludwigstra\u00dfe 33, 80539, Munich, Germany"],"raw_orcid":"https://orcid.org/0000-0001-6002-6980","affiliations":[{"raw_affiliation_string":"Department of Statistics, Statistical Learning and Data Science, LMU Munich, Ludwigstra\u00dfe 33, 80539, Munich, Germany","institution_ids":["https://openalex.org/I8204097"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5039217538"],"corresponding_institution_ids":["https://openalex.org/I8204097","https://openalex.org/I897549280"],"apc_list":{"value":2490,"currency":"EUR","value_usd":3090},"apc_paid":{"value":2490,"currency":"EUR","value_usd":3090},"fwci":16.5131,"has_fulltext":true,"cited_by_count":149,"citation_normalized_percentile":{"value":0.99336346,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"37","issue":"5","first_page":"2671","last_page":"2692"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10057","display_name":"Face and Expression Recognition","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10862","display_name":"AI in cancer detection","score":0.9912999868392944,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cardinality","display_name":"Cardinality (data modeling)","score":0.8086339235305786},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6883902549743652},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.6780092716217041},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6326796412467957},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5647062063217163},{"id":"https://openalex.org/keywords/supervised-learning","display_name":"Supervised learning","score":0.45792680978775024},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4468056261539459},{"id":"https://openalex.org/keywords/semi-supervised-learning","display_name":"Semi-supervised learning","score":0.41466179490089417},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.22906669974327087},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.11039945483207703}],"concepts":[{"id":"https://openalex.org/C87117476","wikidata":"https://www.wikidata.org/wiki/Q362383","display_name":"Cardinality (data modeling)","level":2,"score":0.8086339235305786},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6883902549743652},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6780092716217041},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6326796412467957},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5647062063217163},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.45792680978775024},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4468056261539459},{"id":"https://openalex.org/C58973888","wikidata":"https://www.wikidata.org/wiki/Q1041418","display_name":"Semi-supervised learning","level":2,"score":0.41466179490089417},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.22906669974327087},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.11039945483207703}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1007/s00180-022-01207-6","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00180-022-01207-6","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00180-022-01207-6.pdf","source":{"id":"https://openalex.org/S8500805","display_name":"Computational Statistics","issn_l":"0943-4062","issn":["0943-4062","1613-9658"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Statistics","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2104.00629","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2104.00629","pdf_url":"https://arxiv.org/pdf/2104.00629","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:RePEc:spr:compst:v:37:y:2022:i:5:d:10.1007_s00180-022-01207-6","is_oa":false,"landing_page_url":"http://link.springer.com/10.1007/s00180-022-01207-6","pdf_url":null,"source":{"id":"https://openalex.org/S4306401271","display_name":"RePEc: Research Papers in Economics","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I77793887","host_organization_name":"Federal Reserve Bank of St. Louis","host_organization_lineage":["https://openalex.org/I77793887"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},{"id":"pmh:oai:epub.ub.uni-muenchen.de:106778","is_oa":true,"landing_page_url":"http://nbn-resolving.de/urn:nbn:de:bvb:19-epub-106778-5","pdf_url":null,"source":{"id":"https://openalex.org/S4306401845","display_name":"Open access LMU (Ludwid Maxmilian's Universitat Munchen)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I8204097","host_organization_name":"Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen","host_organization_lineage":["https://openalex.org/I8204097"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Computational Statistics","raw_type":"doc-type:article"}],"best_oa_location":{"id":"doi:10.1007/s00180-022-01207-6","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s00180-022-01207-6","pdf_url":"https://link.springer.com/content/pdf/10.1007/s00180-022-01207-6.pdf","source":{"id":"https://openalex.org/S8500805","display_name":"Computational Statistics","issn_l":"0943-4062","issn":["0943-4062","1613-9658"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Statistics","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/15","display_name":"Life in Land","score":0.5099999904632568}],"awards":[{"id":"https://openalex.org/G3910675937","display_name":null,"funder_award_id":"01IS18036A","funder_id":"https://openalex.org/F4320323991","funder_display_name":"Bundesministerium f\u00fcr Bildung, Wissenschaft und Kultur"},{"id":"https://openalex.org/G6120799783","display_name":null,"funder_award_id":"20-3410-2-9-8","funder_id":"https://openalex.org/F4320323888","funder_display_name":"Bayerisches Staatsministerium\u00a0f\u00fcr Wirtschaft und Medien,\u00a0Energie und Technologie"}],"funders":[{"id":"https://openalex.org/F4320323888","display_name":"Bayerisches Staatsministerium\u00a0f\u00fcr Wirtschaft und Medien,\u00a0Energie und Technologie","ror":"https://ror.org/00rspzk45"},{"id":"https://openalex.org/F4320323991","display_name":"Bundesministerium f\u00fcr Bildung, Wissenschaft und Kultur","ror":"https://ror.org/03rrfep50"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3142848995.pdf","grobid_xml":"https://content.openalex.org/works/W3142848995.grobid-xml"},"referenced_works_count":48,"referenced_works":["https://openalex.org/W1532821071","https://openalex.org/W1556996173","https://openalex.org/W1896739462","https://openalex.org/W1951724000","https://openalex.org/W1981276685","https://openalex.org/W1981457167","https://openalex.org/W1989718642","https://openalex.org/W1990836268","https://openalex.org/W2070996757","https://openalex.org/W2071389326","https://openalex.org/W2095234124","https://openalex.org/W2097360283","https://openalex.org/W2102539288","https://openalex.org/W2106525823","https://openalex.org/W2116825089","https://openalex.org/W2120199131","https://openalex.org/W2132862423","https://openalex.org/W2137886649","https://openalex.org/W2156504490","https://openalex.org/W2157395790","https://openalex.org/W2168123127","https://openalex.org/W2182361439","https://openalex.org/W2408962487","https://openalex.org/W2460566717","https://openalex.org/W2559655401","https://openalex.org/W2589615383","https://openalex.org/W2770322578","https://openalex.org/W2797928606","https://openalex.org/W2799900537","https://openalex.org/W2843469511","https://openalex.org/W2914279003","https://openalex.org/W2954052001","https://openalex.org/W2963105378","https://openalex.org/W2964022491","https://openalex.org/W2965153324","https://openalex.org/W2973941913","https://openalex.org/W3020873385","https://openalex.org/W3099802519","https://openalex.org/W3102027041","https://openalex.org/W3105965470","https://openalex.org/W3159303128","https://openalex.org/W4200537922","https://openalex.org/W4243072198","https://openalex.org/W4255714526","https://openalex.org/W4294541781","https://openalex.org/W4399649002","https://openalex.org/W6600787954","https://openalex.org/W6608287133"],"related_works":["https://openalex.org/W3114793362","https://openalex.org/W1586607209","https://openalex.org/W122912556","https://openalex.org/W4312414840","https://openalex.org/W2621411691","https://openalex.org/W2513638114","https://openalex.org/W2271357838","https://openalex.org/W4390062853","https://openalex.org/W4389256085","https://openalex.org/W2518241345"],"abstract_inverted_index":{"Abstract":[0],"Since":[1],"most":[2],"machine":[3],"learning":[4],"(ML)":[5],"algorithms":[6,100],"are":[7,26],"designed":[8],"for":[9],"numerical":[10,144],"inputs,":[11],"efficiently":[12],"encoding":[13,94,127],"categorical":[14,32,49],"variables":[15,34,50,186],"is":[16],"a":[17,36,69,86,142],"crucial":[18],"aspect":[19],"in":[20,56,137,195],"data":[21],"analysis.":[22],"A":[23],"common":[24],"problem":[25],"high":[27,37],"cardinality":[28],"features,":[29],"i.e.":[30],"unordered":[31],"predictor":[33],"with":[35,97],"number":[38,171],"of":[39,48,65,125,172],"levels.":[40],"We":[41,60,84],"study":[42],"techniques":[43,67],"that":[44,155],"yield":[45],"numeric":[46],"representations":[47],"which":[51,82],"can":[52],"then":[53],"be":[54],"used":[55,153],"subsequent":[57,70],"ML":[58,99],"applications.":[59],"focus":[61],"on":[62,68,78,133,176],"the":[63,134,138,148,170],"impact":[64],"these":[66],"algorithm\u2019s":[71],"predictive":[72],"performance,":[73],"and\u2014if":[74],"possible\u2014derive":[75],"best":[76,149],"practices":[77],"when":[79],"to":[80,159,162,168],"use":[81],"technique.":[83],"conducted":[85],"large-scale":[87],"benchmark":[88],"experiment,":[89],"where":[90],"we":[91],"compared":[92],"different":[93],"strategies":[95],"together":[96],"five":[98],"(lasso,":[101],"random":[102],"forest,":[103],"gradient":[104],"boosting,":[105],"k":[106],"-nearest":[107],"neighbors,":[108],"support":[109],"vector":[110],"machine)":[111],"using":[112,129],"datasets":[113],"from":[114],"regression,":[115],"binary-":[116],"and":[117],"multiclass\u2013classification":[118],"settings.":[119],"In":[120],"our":[121],"study,":[122],"regularized":[123],"versions":[124],"target":[126,130,177],"(i.e.":[128],"predictions":[131],"based":[132,175],"feature":[135],"levels":[136,161,173],"training":[139],"set":[140],"as":[141,193],"new":[143],"feature)":[145],"consistently":[146],"provided":[147],"results.":[150],"Traditionally":[151],"widely":[152],"encodings":[154],"make":[156],"unreasonable":[157],"assumptions":[158],"map":[160],"integers":[163],"(e.g.":[164],"integer":[165],"encoding)":[166,181,190],"or":[167,188],"reduce":[169],"(possibly":[174],"information,":[178],"e.g.":[179],"leaf":[180],"before":[182],"creating":[183],"binary":[184],"indicator":[185],"(one-hot":[187],"dummy":[189],"were":[191],"not":[192],"effective":[194],"comparison.":[196]},"counts_by_year":[{"year":2026,"cited_by_count":28},{"year":2025,"cited_by_count":45},{"year":2024,"cited_by_count":43},{"year":2023,"cited_by_count":25},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":2}],"updated_date":"2026-06-17T08:01:34.144755","created_date":"2025-10-10T00:00:00"}
