{"id":"https://openalex.org/W4390547705","doi":"https://doi.org/10.1186/s40537-023-00857-7","title":"Impact of random oversampling and random undersampling on the performance of prediction models developed using observational health data","display_name":"Impact of random oversampling and random undersampling on the performance of prediction models developed using observational health data","publication_year":2024,"publication_date":"2024-01-03","ids":{"openalex":"https://openalex.org/W4390547705","doi":"https://doi.org/10.1186/s40537-023-00857-7"},"language":"en","primary_location":{"id":"doi:10.1186/s40537-023-00857-7","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s40537-023-00857-7","pdf_url":"https://journalofbigdata.springeropen.com/counter/pdf/10.1186/s40537-023-00857-7","source":{"id":"https://openalex.org/S2737955091","display_name":"Journal Of Big Data","issn_l":"2196-1115","issn":["2196-1115"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Big Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://journalofbigdata.springeropen.com/counter/pdf/10.1186/s40537-023-00857-7","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090242240","display_name":"Cynthia Yang","orcid":"https://orcid.org/0000-0001-6769-3153"},"institutions":[{"id":"https://openalex.org/I2801952686","display_name":"Erasmus MC","ror":"https://ror.org/018906e22","country_code":"NL","type":"funder","lineage":["https://openalex.org/I2801952686"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Cynthia Yang","raw_affiliation_strings":["Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands"],"raw_orcid":"https://orcid.org/0000-0001-6769-3153","affiliations":[{"raw_affiliation_string":"Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands","institution_ids":["https://openalex.org/I2801952686"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080366536","display_name":"Egill A. Fri\u00f0geirsson","orcid":"https://orcid.org/0000-0001-9085-9369"},"institutions":[{"id":"https://openalex.org/I2801952686","display_name":"Erasmus MC","ror":"https://ror.org/018906e22","country_code":"NL","type":"funder","lineage":["https://openalex.org/I2801952686"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Egill A. Fridgeirsson","raw_affiliation_strings":["Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands","institution_ids":["https://openalex.org/I2801952686"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005077862","display_name":"Jan A. Kors","orcid":"https://orcid.org/0000-0002-4929-026X"},"institutions":[{"id":"https://openalex.org/I2801952686","display_name":"Erasmus MC","ror":"https://ror.org/018906e22","country_code":"NL","type":"funder","lineage":["https://openalex.org/I2801952686"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Jan A. Kors","raw_affiliation_strings":["Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands","institution_ids":["https://openalex.org/I2801952686"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031506291","display_name":"Jenna Reps","orcid":"https://orcid.org/0000-0002-2970-0778"},"institutions":[{"id":"https://openalex.org/I4210151386","display_name":"Janssen (United States)","ror":"https://ror.org/05af73403","country_code":"US","type":"company","lineage":["https://openalex.org/I1330063522","https://openalex.org/I4210151386"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jenna M. Reps","raw_affiliation_strings":["Observational Health Data Analytics, Janssen Research and Development, Titusville, NJ, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Observational Health Data Analytics, Janssen Research and Development, Titusville, NJ, USA","institution_ids":["https://openalex.org/I4210151386"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5048645102","display_name":"Peter R. Rijnbeek","orcid":"https://orcid.org/0000-0003-0621-1979"},"institutions":[{"id":"https://openalex.org/I2801952686","display_name":"Erasmus MC","ror":"https://ror.org/018906e22","country_code":"NL","type":"funder","lineage":["https://openalex.org/I2801952686"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Peter R. Rijnbeek","raw_affiliation_strings":["Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Medical Informatics, Erasmus University Medical Center, Dr. Molewaterplein 40, 3015 GD, Rotterdam, The Netherlands","institution_ids":["https://openalex.org/I2801952686"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5090242240"],"corresponding_institution_ids":["https://openalex.org/I2801952686"],"apc_list":{"value":1060,"currency":"GBP","value_usd":1300},"apc_paid":{"value":1060,"currency":"GBP","value_usd":1300},"fwci":22.633,"has_fulltext":true,"cited_by_count":72,"citation_normalized_percentile":{"value":0.99610133,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"11","issue":"1","first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.9940999746322632,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11396","display_name":"Artificial Intelligence in Healthcare","score":0.9861999750137329,"subfield":{"id":"https://openalex.org/subfields/3605","display_name":"Health Information Management"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/undersampling","display_name":"Undersampling","score":0.9902304410934448},{"id":"https://openalex.org/keywords/oversampling","display_name":"Oversampling","score":0.8822218179702759},{"id":"https://openalex.org/keywords/random-forest","display_name":"Random forest","score":0.8362064957618713},{"id":"https://openalex.org/keywords/observational-study","display_name":"Observational study","score":0.7020267844200134},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5631424188613892},{"id":"https://openalex.org/keywords/calibration","display_name":"Calibration","score":0.549859881401062},{"id":"https://openalex.org/keywords/logistic-regression","display_name":"Logistic regression","score":0.5466972589492798},{"id":"https://openalex.org/keywords/receiver-operating-characteristic","display_name":"Receiver operating characteristic","score":0.5317163467407227},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.5178144574165344},{"id":"https://openalex.org/keywords/random-effects-model","display_name":"Random effects model","score":0.4620394706726074},{"id":"https://openalex.org/keywords/bootstrapping","display_name":"Bootstrapping (finance)","score":0.45798125863075256},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4502059519290924},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.42727023363113403},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.33435699343681335},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.30432045459747314},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.2687515616416931},{"id":"https://openalex.org/keywords/medicine","display_name":"Medicine","score":0.21564531326293945},{"id":"https://openalex.org/keywords/meta-analysis","display_name":"Meta-analysis","score":0.10608384013175964},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.1051253080368042},{"id":"https://openalex.org/keywords/internal-medicine","display_name":"Internal medicine","score":0.07237952947616577}],"concepts":[{"id":"https://openalex.org/C136536468","wikidata":"https://www.wikidata.org/wiki/Q1225894","display_name":"Undersampling","level":2,"score":0.9902304410934448},{"id":"https://openalex.org/C197323446","wikidata":"https://www.wikidata.org/wiki/Q331222","display_name":"Oversampling","level":3,"score":0.8822218179702759},{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.8362064957618713},{"id":"https://openalex.org/C23131810","wikidata":"https://www.wikidata.org/wiki/Q818574","display_name":"Observational study","level":2,"score":0.7020267844200134},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5631424188613892},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.549859881401062},{"id":"https://openalex.org/C151956035","wikidata":"https://www.wikidata.org/wiki/Q1132755","display_name":"Logistic regression","level":2,"score":0.5466972589492798},{"id":"https://openalex.org/C58471807","wikidata":"https://www.wikidata.org/wiki/Q327120","display_name":"Receiver operating characteristic","level":2,"score":0.5317163467407227},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.5178144574165344},{"id":"https://openalex.org/C168743327","wikidata":"https://www.wikidata.org/wiki/Q1826427","display_name":"Random effects model","level":3,"score":0.4620394706726074},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.45798125863075256},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4502059519290924},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42727023363113403},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.33435699343681335},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.30432045459747314},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.2687515616416931},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.21564531326293945},{"id":"https://openalex.org/C95190672","wikidata":"https://www.wikidata.org/wiki/Q815382","display_name":"Meta-analysis","level":2,"score":0.10608384013175964},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.1051253080368042},{"id":"https://openalex.org/C126322002","wikidata":"https://www.wikidata.org/wiki/Q11180","display_name":"Internal medicine","level":1,"score":0.07237952947616577},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1186/s40537-023-00857-7","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s40537-023-00857-7","pdf_url":"https://journalofbigdata.springeropen.com/counter/pdf/10.1186/s40537-023-00857-7","source":{"id":"https://openalex.org/S2737955091","display_name":"Journal Of Big Data","issn_l":"2196-1115","issn":["2196-1115"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Big Data","raw_type":"journal-article"},{"id":"pmh:oai:pure.eur.nl:openaire_cris_publications/8ae3aeda-06b2-45f1-b0f7-830be9eea970","is_oa":true,"landing_page_url":"https://pure.eur.nl/en/publications/8ae3aeda-06b2-45f1-b0f7-830be9eea970","pdf_url":"https://pure.eur.nl/ws/files/125043020/Impact_of_random_oversampling_and_random_undersampling_on_the_performance_of_prediction_models_developed_using_observational_health_data.pdf","source":{"id":"https://openalex.org/S4306401266","display_name":"EUR Research Repository (Erasmus University Rotterdam)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I913958620","host_organization_name":"Erasmus University Rotterdam","host_organization_lineage":["https://openalex.org/I913958620"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Yang, C, Fridgeirsson, E A, Kors, J A, Reps, J M & Rijnbeek, P R 2024, 'Impact of random oversampling and random undersampling on the performance of prediction models developed using observational health data', Journal of Big Data, vol. 11, no. 1, 7. https://doi.org/10.1186/s40537-023-00857-7","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:doaj.org/article:16e1370b11874d648d085a5e82d580c9","is_oa":true,"landing_page_url":"https://doaj.org/article/16e1370b11874d648d085a5e82d580c9","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Journal of Big Data, Vol 11, Iss 1, Pp 1-17 (2024)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1186/s40537-023-00857-7","is_oa":true,"landing_page_url":"https://doi.org/10.1186/s40537-023-00857-7","pdf_url":"https://journalofbigdata.springeropen.com/counter/pdf/10.1186/s40537-023-00857-7","source":{"id":"https://openalex.org/S2737955091","display_name":"Journal Of Big Data","issn_l":"2196-1115","issn":["2196-1115"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of Big Data","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.4699999988079071,"id":"https://metadata.un.org/sdg/10"},{"display_name":"Peace, Justice and strong institutions","score":0.4699999988079071,"id":"https://metadata.un.org/sdg/16"}],"awards":[{"id":"https://openalex.org/G8692998141","display_name":null,"funder_award_id":"806968","funder_id":"https://openalex.org/F4320314178","funder_display_name":"European Federation of Pharmaceutical Industries and Associations"},{"id":"https://openalex.org/G8720798102","display_name":"European Health Data and Evidence Network","funder_award_id":"806968","funder_id":"https://openalex.org/F4320320300","funder_display_name":"European Commission"}],"funders":[{"id":"https://openalex.org/F4320314178","display_name":"European Federation of Pharmaceutical Industries and Associations","ror":"https://ror.org/00g1x4v36"},{"id":"https://openalex.org/F4320320300","display_name":"European Commission","ror":"https://ror.org/00k4n6c32"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4390547705.pdf"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W34678172","https://openalex.org/W2024623357","https://openalex.org/W2042571564","https://openalex.org/W2118978333","https://openalex.org/W2122892545","https://openalex.org/W2137195266","https://openalex.org/W2148143831","https://openalex.org/W2225109326","https://openalex.org/W2295598076","https://openalex.org/W2799695199","https://openalex.org/W2899434936","https://openalex.org/W2908465383","https://openalex.org/W2938932803","https://openalex.org/W2955258620","https://openalex.org/W2968847082","https://openalex.org/W2996480032","https://openalex.org/W3021150726","https://openalex.org/W3160709615","https://openalex.org/W3198379981","https://openalex.org/W4200373403","https://openalex.org/W4205328764","https://openalex.org/W4280528557","https://openalex.org/W4281717669","https://openalex.org/W4294541781","https://openalex.org/W6675354045","https://openalex.org/W6723278492"],"related_works":["https://openalex.org/W4308469503","https://openalex.org/W32988189","https://openalex.org/W2904737874","https://openalex.org/W4389233021","https://openalex.org/W2399571531","https://openalex.org/W80466363","https://openalex.org/W2947132063","https://openalex.org/W4288337828","https://openalex.org/W4390415670","https://openalex.org/W4287816717"],"abstract_inverted_index":{"Abstract":[0],"Background":[1],"There":[2],"is":[3],"currently":[4],"no":[5],"consensus":[6],"on":[7,14,40,105,217],"the":[8,15,26,41,97,103,118,121,177,180,184,201],"impact":[9,27,104],"of":[10,17,28,47,66,72,110,142,207],"class":[11,37],"imbalance":[12,38,99,181],"methods":[13],"performance":[16,46,107,206],"clinical":[18],"prediction":[19,48,61,144,208,232],"models.":[20,145],"We":[21,56,84,101,135],"aimed":[22],"to":[23],"empirically":[24],"investigate":[25],"random":[29,32,92,151,154,192,195,225,228],"oversampling":[30,152,193,226],"and":[31,43,58,95,112,127,137,148,153,203],"undersampling,":[33],"two":[34],"commonly":[35],"used":[36,85],"methods,":[39],"internal":[42,147,202],"external":[44,149,204],"validation":[45,205],"models":[49,62,178,209,233],"developed":[50,57,136,210],"using":[51,117,131],"observational":[52,81,213,236],"health":[53,82,214,237],"data.":[54],"Methods":[55],"externally":[59,138],"validated":[60,139],"for":[63],"various":[64],"outcomes":[65],"interest":[67],"within":[68],"a":[69,140],"target":[70,98],"population":[71],"people":[73],"with":[74],"pharmaceutically":[75],"treated":[76],"depression":[77],"across":[78],"four":[79],"large":[80,212,235],"databases.":[83,215,238],"three":[86],"different":[87],"classifiers":[88],"(lasso":[89],"logistic":[90],"regression,":[91],"forest,":[93],"XGBoost)":[94],"varied":[96],"ratio.":[100],"evaluated":[102],"model":[106],"in":[108,160,183,211,234],"terms":[109],"discrimination":[111],"calibration.":[113],"Discrimination":[114],"was":[115,129],"assessed":[116,130],"area":[119],"under":[120],"receiver":[122],"operating":[123],"characteristic":[124],"curve":[125],"(AUROC)":[126],"calibration":[128,132],"plots.":[133],"Results":[134],"total":[141],"1,566":[143],"On":[146],"validation,":[150],"undersampling":[155,196,229],"generally":[156,197],"did":[157],"not":[158,199,222],"result":[159],"higher":[161],"AUROCs.":[162],"Moreover,":[163],"we":[164,189,220],"found":[165,190],"overestimated":[166],"risks,":[167],"although":[168],"this":[169],"miscalibration":[170],"could":[171],"largely":[172],"be":[173],"corrected":[174],"by":[175],"recalibrating":[176],"towards":[179],"ratios":[182],"original":[185],"dataset.":[186],"Conclusions":[187],"Overall,":[188],"that":[191],"or":[194,227],"does":[198],"improve":[200],"Based":[216],"our":[218],"findings,":[219],"do":[221],"recommend":[223],"applying":[224],"when":[230],"developing":[231]},"counts_by_year":[{"year":2026,"cited_by_count":15},{"year":2025,"cited_by_count":41},{"year":2024,"cited_by_count":16}],"updated_date":"2026-05-24T08:33:08.758527","created_date":"2025-10-10T00:00:00"}
