{"id":"https://openalex.org/W7117028293","doi":"https://doi.org/10.48550/arxiv.2512.17460","title":"When Data Quality Issues Collide: A Large-Scale Empirical Study of Co-Occurring Data Quality Issues in Software Defect Prediction","display_name":"When Data Quality Issues Collide: A Large-Scale Empirical Study of Co-Occurring Data Quality Issues in Software Defect Prediction","publication_year":2025,"publication_date":"2025-12-19","ids":{"openalex":"https://openalex.org/W7117028293","doi":"https://doi.org/10.48550/arxiv.2512.17460"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2512.17460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.17460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2512.17460","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5060873383","display_name":"Emmanuel Charleson Dapaah","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dapaah, Emmanuel Charleson","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5078674128","display_name":"Jens Grabowski","orcid":"https://orcid.org/0000-0003-2994-3531"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Grabowski, Jens","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5060873383"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9659000039100647,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9659000039100647,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.007699999958276749,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.0035000001080334187,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/counterintuitive","display_name":"Counterintuitive","score":0.6581000089645386},{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.6162999868392944},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.5637000203132629},{"id":"https://openalex.org/keywords/software-quality","display_name":"Software quality","score":0.5608999729156494},{"id":"https://openalex.org/keywords/software-bug","display_name":"Software bug","score":0.5522000193595886},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.5453000068664551},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.5153999924659729},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.48750001192092896},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.4772000014781952}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7455000281333923},{"id":"https://openalex.org/C101097943","wikidata":"https://www.wikidata.org/wiki/Q5176983","display_name":"Counterintuitive","level":2,"score":0.6581000089645386},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6183000206947327},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6172000169754028},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.6162999868392944},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.5637000203132629},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.5608999729156494},{"id":"https://openalex.org/C1009929","wikidata":"https://www.wikidata.org/wiki/Q179550","display_name":"Software bug","level":3,"score":0.5522000193595886},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.5453000068664551},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.5153999924659729},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48899999260902405},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.48750001192092896},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.4772000014781952},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4713999927043915},{"id":"https://openalex.org/C8642999","wikidata":"https://www.wikidata.org/wiki/Q4171168","display_name":"Hyperparameter","level":2,"score":0.42719998955726624},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4124999940395355},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.38179999589920044},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.37400001287460327},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3287999927997589},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.32010000944137573},{"id":"https://openalex.org/C2776969324","wikidata":"https://www.wikidata.org/wiki/Q613918","display_name":"Software quality assurance","level":5,"score":0.3082999885082245},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C133199616","wikidata":"https://www.wikidata.org/wiki/Q25386885","display_name":"Empirical modelling","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C188329197","wikidata":"https://www.wikidata.org/wiki/Q6554613","display_name":"Software quality analyst","level":5,"score":0.265500009059906},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.2615000009536743},{"id":"https://openalex.org/C82214349","wikidata":"https://www.wikidata.org/wiki/Q657339","display_name":"Software metric","level":5,"score":0.25369998812675476},{"id":"https://openalex.org/C148483581","wikidata":"https://www.wikidata.org/wiki/Q446488","display_name":"Feature selection","level":2,"score":0.25220000743865967},{"id":"https://openalex.org/C175801342","wikidata":"https://www.wikidata.org/wiki/Q1988917","display_name":"Data analysis","level":2,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2512.17460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.17460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2512.17460","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.17460","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Software":[0],"Defect":[1],"Prediction":[2],"(SDP)":[3],"models":[4,171],"are":[5,139,188],"central":[6],"to":[7,52,96,173,233],"proactive":[8],"software":[9],"quality":[10,20,68,241],"assurance,":[11],"yet":[12],"their":[13],"effectiveness":[14],"is":[15,116,145],"often":[16],"constrained":[17],"by":[18],"the":[19,55,120,146,191],"of":[21,133,193,239],"available":[22],"datasets.":[23,134],"Prior":[24],"research":[25],"has":[26],"typically":[27],"examined":[28],"single":[29,203],"issues":[30,69,242],"such":[31,180],"as":[32,181],"class":[33,72,143,159],"imbalance":[34,138],"or":[35],"feature":[36],"irrelevance":[37],"in":[38,60,129,225,246],"isolation,":[39],"overlooking":[40],"that":[41,62,114],"real-world":[42,247],"data":[43,67],"problems":[44],"frequently":[45],"co-occur":[46],"and":[47,78,83,100,137,164,215],"interact.":[48],"This":[49],"study":[50,219],"presents,":[51],"our":[53,218],"knowledge,":[54],"first":[56],"large-scale":[57],"empirical":[58],"analysis":[59,95],"SDP":[61,226],"simultaneously":[63],"examines":[64],"five":[65,84],"co-occurring":[66],"(class":[70],"imbalance,":[71,163],"overlap,":[73,160],"irrelevant":[74,186],"features,":[75],"attribute":[76],"noise,":[77],"outliers)":[79],"across":[80],"374":[81],"datasets":[82],"classifiers.":[85],"We":[86,151,175],"employ":[87],"Explainable":[88],"Boosting":[89],"Machines":[90],"together":[91],"with":[92],"stratified":[93],"interaction":[94],"quantify":[97],"both":[98],"direct":[99],"conditional":[101,216],"effects":[102],"under":[103,206],"default":[104],"hyperparameter":[105],"settings,":[106],"reflecting":[107],"practical":[108],"baseline":[109],"usage.":[110],"Our":[111],"results":[112],"show":[113],"co-occurrence":[115],"nearly":[117,140],"universal:":[118],"even":[119],"least":[121],"frequent":[122],"issue":[123],"(attribute":[124],"noise)":[125],"appears":[126],"alongside":[127],"others":[128],"more":[130],"than":[131],"93%":[132],"Irrelevant":[135],"features":[136,187],"ubiquitous,":[141],"while":[142],"overlap":[144],"most":[147,170],"consistently":[148],"harmful":[149],"issue.":[150],"identify":[152],"stable":[153],"tipping":[154],"points":[155],"around":[156],"0.20":[157],"for":[158,162,166],"0.65-0.70":[161],"0.94":[165],"irrelevance,":[167],"beyond":[168,230],"which":[169],"begin":[172],"degrade.":[174],"also":[176],"uncover":[177],"counterintuitive":[178],"patterns,":[179],"outliers":[182],"improving":[183],"performance":[184,245],"when":[185],"low,":[189],"underscoring":[190],"importance":[192],"context-aware":[194],"evaluation.":[195],"Finally,":[196],"we":[197],"expose":[198],"a":[199,222,235],"performance-robustness":[200],"trade-off:":[201],"no":[202],"learner":[204],"dominates":[205],"all":[207],"conditions.":[208],"By":[209],"jointly":[210],"analyzing":[211],"prevalence,":[212],"co-occurrence,":[213],"thresholds,":[214],"effects,":[217],"directly":[220],"addresses":[221],"persistent":[223],"gap":[224],"research.":[227],"Hence,":[228],"moving":[229],"isolated":[231],"analyses":[232],"provide":[234],"holistic,":[236],"data-aware":[237],"understanding":[238],"how":[240],"shape":[243],"model":[244],"settings.":[248]},"counts_by_year":[],"updated_date":"2025-12-23T23:15:37.779995","created_date":"2025-12-23T00:00:00"}
