{"id":"https://openalex.org/W4415008686","doi":"https://doi.org/10.1145/3763098","title":"Understanding and Improving Flaky Test Classification","display_name":"Understanding and Improving Flaky Test Classification","publication_year":2025,"publication_date":"2025-10-09","ids":{"openalex":"https://openalex.org/W4415008686","doi":"https://doi.org/10.1145/3763098"},"language":"en","primary_location":{"id":"doi:10.1145/3763098","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3763098","pdf_url":null,"source":{"id":"https://openalex.org/S4210216081","display_name":"Proceedings of the ACM on Programming Languages","issn_l":"2475-1421","issn":["2475-1421"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Programming Languages","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1145/3763098","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102397632","display_name":"Shanto Rahman","orcid":"https://orcid.org/0000-0002-0599-8215"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shanto Rahman","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0000-0002-0599-8215","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095107883","display_name":"Saikat Dutta","orcid":"https://orcid.org/0000-0001-8346-4019"},"institutions":[{"id":"https://openalex.org/I205783295","display_name":"Cornell University","ror":"https://ror.org/05bnh6r87","country_code":"US","type":"education","lineage":["https://openalex.org/I205783295"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Saikat Dutta","raw_affiliation_strings":["Cornell University, Ithaca, USA"],"raw_orcid":"https://orcid.org/0000-0001-8346-4019","affiliations":[{"raw_affiliation_string":"Cornell University, Ithaca, USA","institution_ids":["https://openalex.org/I205783295"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081483504","display_name":"August Shi","orcid":"https://orcid.org/0000-0001-8239-3124"},"institutions":[{"id":"https://openalex.org/I86519309","display_name":"The University of Texas at Austin","ror":"https://ror.org/00hj54h04","country_code":"US","type":"education","lineage":["https://openalex.org/I86519309"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"August Shi","raw_affiliation_strings":["The University of Texas at Austin, Austin, USA"],"raw_orcid":"https://orcid.org/0000-0001-8239-3124","affiliations":[{"raw_affiliation_string":"The University of Texas at Austin, Austin, USA","institution_ids":["https://openalex.org/I86519309"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5102397632"],"corresponding_institution_ids":["https://openalex.org/I86519309"],"apc_list":null,"apc_paid":null,"fwci":5.8506,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.960193,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"9","issue":"OOPSLA2","first_page":"1345","last_page":"1371"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10743","display_name":"Software Testing and Debugging Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/test","display_name":"Test (biology)","score":0.5414000153541565},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4641000032424927},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.41359999775886536},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.36169999837875366},{"id":"https://openalex.org/keywords/predictive-modelling","display_name":"Predictive modelling","score":0.3240000009536743}],"concepts":[{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.635200023651123},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6032999753952026},{"id":"https://openalex.org/C2777267654","wikidata":"https://www.wikidata.org/wiki/Q3519023","display_name":"Test (biology)","level":2,"score":0.5414000153541565},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48649999499320984},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4641000032424927},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.41359999775886536},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.36169999837875366},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.34220001101493835},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C2984328558","wikidata":"https://www.wikidata.org/wiki/Q188522","display_name":"Software testing","level":3,"score":0.2581000030040741}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3763098","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3763098","pdf_url":null,"source":{"id":"https://openalex.org/S4210216081","display_name":"Proceedings of the ACM on Programming Languages","issn_l":"2475-1421","issn":["2475-1421"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Programming Languages","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3763098","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3763098","pdf_url":null,"source":{"id":"https://openalex.org/S4210216081","display_name":"Proceedings of the ACM on Programming Languages","issn_l":"2475-1421","issn":["2475-1421"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Programming Languages","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4300429361","display_name":null,"funder_award_id":"CCF-2145774,CCF-2217696","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1976954202","https://openalex.org/W2026926213","https://openalex.org/W2107500604","https://openalex.org/W2282821441","https://openalex.org/W2477215876","https://openalex.org/W2589663975","https://openalex.org/W2591823109","https://openalex.org/W2951170114","https://openalex.org/W2955940099","https://openalex.org/W2957454295","https://openalex.org/W2963023085","https://openalex.org/W2963351448","https://openalex.org/W3043238989","https://openalex.org/W3085380432","https://openalex.org/W3090144865","https://openalex.org/W3091299963","https://openalex.org/W3095312267","https://openalex.org/W3102799430","https://openalex.org/W3106532184","https://openalex.org/W3167640545","https://openalex.org/W3179011554","https://openalex.org/W3194175074","https://openalex.org/W3198806484","https://openalex.org/W4220722393","https://openalex.org/W4225823287","https://openalex.org/W4239672721","https://openalex.org/W4250616538","https://openalex.org/W4292967768","https://openalex.org/W4296872689","https://openalex.org/W4308643311","https://openalex.org/W4312264921","https://openalex.org/W4379512492","https://openalex.org/W4382317573","https://openalex.org/W4384009631","https://openalex.org/W4401905754","https://openalex.org/W4401907012","https://openalex.org/W6930880628"],"related_works":[],"abstract_inverted_index":{"Regression":[0],"testing":[1],"is":[2],"an":[3],"essential":[4],"part":[5],"of":[6,15,71,118,122,286,351],"software":[7],"development,":[8],"but":[9],"it":[10,92],"suffers":[11],"from":[12,156],"the":[13,28,69,72,84,106,119,133,146,180,188,204,236,240,245,266,273,284,299,304,316,320],"presence":[14],"flaky":[16,53,85,102,123,174,231,274,279],"tests":[17,19,54,126,300],"\u2013":[18,82],"that":[20,45,65,178,211,218,243,268,333],"pass":[21],"and":[22,37,79,115,136,201,301,344],"fail":[23],"non-deterministically":[24],"when":[25,312],"run":[26],"on":[27,203,224,272,346],"same":[29,205],"code.":[30],"These":[31,330],"unpredictable":[32],"failures":[33],"waste":[34],"developers\u2019":[35],"time":[36],"often":[38],"hide":[39],"real":[40,120],"bugs.":[41],"Prior":[42],"work":[43],"showed":[44],"fine-tuned":[46],"large":[47],"language":[48],"models":[49,73,246,335],"(LLMs)":[50],"can":[51,323],"classify":[52],"into":[55,298,360],"different":[56],"categories":[57],"with":[58],"very":[59],"high":[60],"accuracy.":[61],"However,":[62],"we":[63,97,131,143,165,234,264,290],"find":[64],"prior":[66,101,147],"approaches":[67],"over-estimated":[68],"accuracy":[70,108,322],"due":[74,109],"to":[75,110,159,171,183,338],"incorrect":[76,251],"experimental":[77,134],"design":[78,114,135],"unrealistic":[80],"datasets":[81],"making":[83,248],"test":[86,103,175,232,241,275,280],"classification":[87,181,206,321],"problem":[88],"seem":[89],"simpler":[90],"than":[91,187],"is.":[93],"In":[94],"this":[95,225],"paper,":[96],"first":[98],"show":[99,210,310],"how":[100],"classifiers":[104],"over-estimate":[105],"prediction":[107],"1)":[111],"flawed":[112],"experiment":[113],"2)":[116],"mis-representation":[117],"distribution":[121],"(and":[124],"non-flaky)":[125],"in":[127,154,239,247,261],"their":[128,341,353],"datasets.":[129],"After":[130],"fix":[132],"construct":[137],"a":[138,151,167,173],"more":[139,361],"realistic":[140],"dataset":[141],"(which":[142],"name":[144],"FlakeBench),":[145],"state-of-the-art":[148],"model":[149],"shows":[150],"steep":[152],"drop":[153],"F1-score,":[155],"81.82%":[157],"down":[158],"56.62%.":[160],"Motivated":[161],"by":[162,325],"these":[163,215,287,295,334],"observations,":[164],"develop":[166],"new":[168],"training":[169,342,363],"strategy":[170],"fine-tune":[172],"classifier,":[176,233],"FlakyLens,":[177],"improves":[179],"F1-score":[182],"65.79%":[184],"(9.17pp":[185],"higher":[186,270],"state-of-the-art).":[189],"We":[190],"also":[191],"compare":[192],"FlakyLens":[193,212],"against":[194],"recent":[195],"pre-trained":[196],"LLMs,":[197],"such":[198],"as":[199,326,328],"CodeLlama":[200],"DeepSeekCoder,":[202],"task.":[207,227],"Our":[208,308],"results":[209,331],"consistently":[213],"outperforms":[214],"models,":[216],"highlighting":[217],"general-purpose":[219],"LLMs":[220],"still":[221,336],"fall":[222],"short":[223],"specialized":[226],"Using":[228],"our":[229],"improved":[230],"identify":[235],"important":[237,288,296,318],"tokens":[238,267,297,349],"code":[242,259],"influence":[244,285],"correct":[249],"or":[250],"predictions.":[252],"By":[253],"leveraging":[254],"attribution":[255],"scores":[256],"computed":[257],"per":[258,278],"token":[260],"each":[262],"test,":[263],"investigate":[265],"have":[269],"impact":[271],"classifier\u2019s":[276],"decision-making":[277],"category.":[281],"To":[282],"assess":[283],"tokens,":[289,319],"introduce":[291],"adversarial":[292],"perturbation":[293],"using":[294,315],"observe":[302],"whether":[303],"model\u2019s":[305],"predictions":[306],"change.":[307],"findings":[309],"that,":[311],"introducing":[313],"perturbations":[314],"most":[317],"change":[324],"much":[327],"-18.37pp.":[329],"highlight":[332],"struggle":[337],"generalize":[339],"beyond":[340],"data":[343],"rely":[345],"identifying":[347],"category-specific":[348],"(instead":[350],"understanding":[352],"semantic":[354],"context),":[355],"calling":[356],"for":[357],"further":[358],"research":[359],"robust":[362],"methodologies.":[364]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":1}],"updated_date":"2026-05-14T08:36:36.166977","created_date":"2025-10-10T00:00:00"}
