{"id":"https://openalex.org/W1993220166","doi":"https://doi.org/10.1145/1007730.1007735","title":"A study of the behavior of several methods for balancing machine learning training data","display_name":"A study of the behavior of several methods for balancing machine learning training data","publication_year":2004,"publication_date":"2004-06-01","ids":{"openalex":"https://openalex.org/W1993220166","doi":"https://doi.org/10.1145/1007730.1007735","mag":"1993220166"},"language":"en","primary_location":{"id":"doi:10.1145/1007730.1007735","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1007730.1007735","pdf_url":null,"source":{"id":"https://openalex.org/S4210176598","display_name":"ACM SIGKDD Explorations Newsletter","issn_l":"1931-0145","issn":["1931-0145","1931-0153"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM SIGKDD Explorations Newsletter","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102371518","display_name":"Gustavo E. A. P. A. Batista","orcid":null},"institutions":[{"id":"https://openalex.org/I4210131883","display_name":"Brazilian Society of Computational and Applied Mathematics","ror":"https://ror.org/03kcw4w74","country_code":"BR","type":"other","lineage":["https://openalex.org/I4210131883"]}],"countries":["BR"],"is_corresponding":true,"raw_author_name":"Gustavo E. A. P. A. Batista","raw_affiliation_strings":["Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, S\u00e3o Carlos - SP, Brazil"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, S\u00e3o Carlos - SP, Brazil","institution_ids":["https://openalex.org/I4210131883"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005717519","display_name":"Ronaldo C. Prati","orcid":"https://orcid.org/0000-0001-8597-4987"},"institutions":[{"id":"https://openalex.org/I4210131883","display_name":"Brazilian Society of Computational and Applied Mathematics","ror":"https://ror.org/03kcw4w74","country_code":"BR","type":"other","lineage":["https://openalex.org/I4210131883"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Ronaldo C. Prati","raw_affiliation_strings":["Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, S\u00e3o Carlos - SP, Brazil"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, S\u00e3o Carlos - SP, Brazil","institution_ids":["https://openalex.org/I4210131883"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5000813310","display_name":"Maria Carolina Monard","orcid":"https://orcid.org/0000-0001-6004-7407"},"institutions":[{"id":"https://openalex.org/I4210131883","display_name":"Brazilian Society of Computational and Applied Mathematics","ror":"https://ror.org/03kcw4w74","country_code":"BR","type":"other","lineage":["https://openalex.org/I4210131883"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Maria Carolina Monard","raw_affiliation_strings":["Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, S\u00e3o Carlos - SP, Brazil"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, S\u00e3o Carlos - SP, Brazil","institution_ids":["https://openalex.org/I4210131883"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5102371518"],"corresponding_institution_ids":["https://openalex.org/I4210131883"],"apc_list":null,"apc_paid":null,"fwci":37.4648,"has_fulltext":false,"cited_by_count":4106,"citation_normalized_percentile":{"value":0.99786673,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"6","issue":"1","first_page":"20","last_page":"29"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9883999824523926,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8465416431427002},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.7231571078300476},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.7109780311584473},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6125052571296692},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5954932570457458},{"id":"https://openalex.org/keywords/simple-random-sample","display_name":"Simple random sample","score":0.5651431083679199},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.47093668580055237},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.44805052876472473},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.44138047099113464}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8465416431427002},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.7231571078300476},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.7109780311584473},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6125052571296692},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5954932570457458},{"id":"https://openalex.org/C20353970","wikidata":"https://www.wikidata.org/wiki/Q1056998","display_name":"Simple random sample","level":3,"score":0.5651431083679199},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.47093668580055237},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.44805052876472473},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.44138047099113464},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.0},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0},{"id":"https://openalex.org/C149923435","wikidata":"https://www.wikidata.org/wiki/Q37732","display_name":"Demography","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1007730.1007735","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1007730.1007735","pdf_url":null,"source":{"id":"https://openalex.org/S4210176598","display_name":"ACM SIGKDD Explorations Newsletter","issn_l":"1931-0145","issn":["1931-0145","1931-0153"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM SIGKDD Explorations Newsletter","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W85350352","https://openalex.org/W102369970","https://openalex.org/W142793689","https://openalex.org/W1499467920","https://openalex.org/W1504694836","https://openalex.org/W1576442155","https://openalex.org/W1591261915","https://openalex.org/W1941659294","https://openalex.org/W1977245551","https://openalex.org/W1994410331","https://openalex.org/W2004131797","https://openalex.org/W2058732827","https://openalex.org/W2084812512","https://openalex.org/W2107686700","https://openalex.org/W2114968414","https://openalex.org/W2122496402","https://openalex.org/W2125055259","https://openalex.org/W2136903812","https://openalex.org/W2137029138","https://openalex.org/W2147169507","https://openalex.org/W2148143831","https://openalex.org/W2152761983","https://openalex.org/W2157092487","https://openalex.org/W2170913656","https://openalex.org/W2797114046","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2370014976","https://openalex.org/W2350399852","https://openalex.org/W3047864323","https://openalex.org/W1036777753","https://openalex.org/W4379210352","https://openalex.org/W4238714840","https://openalex.org/W4362679606","https://openalex.org/W2994963386","https://openalex.org/W3004700962","https://openalex.org/W2088812990"],"abstract_inverted_index":{"There":[0],"are":[1,283],"several":[2],"aspects":[3,22],"that":[4,18,112,280],"might":[5],"influence":[6],"the":[7,40,43,62,70,74,94,99,119,126,141,194,197,210,255,266,270,288,298,302,312,316,326],"performance":[8,120,261],"achieved":[9],"by":[10,93],"existing":[11],"learning":[12,63,122,133],"systems.":[13,123],"It":[14],"has":[15],"been":[16],"reported":[17],"one":[19,36],"of":[20,90,121,143,152,213,235,269,305,319],"these":[21,158,281],"is":[23,50,246],"related":[24,72,131],"to":[25,35,68,73,96,129,132,172,204,249],"class":[26,37,100,113,138,149,175],"imbalance":[27,101,114],"in":[28,31,42,52,103,140,170,182,209,301,315],"which":[29,49],"examples":[30,41,139],"training":[32],"data":[33,55,106,167,229],"belonging":[34],"heavily":[38],"outnumber":[39],"other":[44,144],"class.":[45,76],"In":[46,77,124],"this":[47,78],"situation,":[48],"found":[51],"real":[53],"world":[54],"describing":[56],"an":[57],"infrequent":[58],"but":[59],"important":[60],"event,":[61],"system":[64],"may":[65],"have":[66],"difficulties":[67],"learn":[69],"concept":[71],"minority":[75,137],"work":[79],"we":[80,263],"perform":[81],"a":[82,162,232,241],"broad":[83],"experimental":[84],"evaluation":[85],"involving":[86],"ten":[87],"methods,":[88,216],"three":[89],"them":[91],"proposed":[92,154,215],"authors,":[95],"deal":[97,156],"with":[98,134,157,166,231],"problem":[102,127],"thirteen":[104],"UCI":[105],"sets.":[107],"Our":[108,177,277],"experiments":[109,179],"provide":[110,186],"evidence":[111],"does":[115],"not":[116],"systematically":[117],"hinder":[118],"fact,":[125],"seems":[128,203],"be":[130],"too":[135],"few":[136],"presence":[142],"complicating":[145],"factors,":[146],"such":[147],"as":[148],"overlapping.":[150],"Two":[151,212],"our":[153,214],"methods":[155,169,185,192,257],"conditions":[159,320],"directly,":[160],"allying":[161],"known":[163],"over-sampling":[164,184,244,252,256,295,328],"method":[165],"cleaning":[168],"order":[171],"produce":[173],"better-defined":[174],"clusters.":[176],"comparative":[178],"show":[180,279],"that,":[181],"general,":[183],"more":[187,250,285],"accurate":[188],"results":[189,206,227,278],"than":[190],"under-sampling":[191],"considering":[193],"area":[195],"under":[196],"ROC":[198],"curve":[199],"(AUC).":[200],"This":[201],"result":[202],"contradict":[205],"previously":[207],"published":[208],"literature.":[211],"Smote":[217,221,309],"+":[218,222,310],"Tomek":[219],"and":[220,308],"ENN,":[223],"presented":[224],"very":[225,242,247,259],"good":[226,260],"for":[228],"sets":[230],"small":[233],"number":[234,304,318],"positive":[236],"examples.":[237],"Moreover,":[238],"Random":[239,294],"over-sampling,":[240],"simple":[243],"method,":[245],"competitive":[248],"complex":[251,286],"methods.":[253,329],"Since":[254],"provided":[258],"results,":[262],"also":[264],"measured":[265],"syntactic":[267],"complexity":[268],"decision":[271],"trees":[272,282],"induced":[273,290,306],"from":[274,291],"over-sampled":[275],"data.":[276,293],"usually":[284,296],"then":[287],"ones":[289],"original":[292],"produced":[297],"smallest":[299,313],"increase":[300,314],"mean":[303,317],"rules":[307],"ENN":[311],"per":[321],"rule,":[322],"when":[323],"compared":[324],"among":[325],"investigated":[327]},"counts_by_year":[{"year":2026,"cited_by_count":140},{"year":2025,"cited_by_count":463},{"year":2024,"cited_by_count":437},{"year":2023,"cited_by_count":422},{"year":2022,"cited_by_count":418},{"year":2021,"cited_by_count":366},{"year":2020,"cited_by_count":372},{"year":2019,"cited_by_count":282},{"year":2018,"cited_by_count":198},{"year":2017,"cited_by_count":148},{"year":2016,"cited_by_count":128},{"year":2015,"cited_by_count":102},{"year":2014,"cited_by_count":100},{"year":2013,"cited_by_count":103},{"year":2012,"cited_by_count":86}],"updated_date":"2026-06-05T09:01:59.212387","created_date":"2025-10-10T00:00:00"}
