{"id":"https://openalex.org/W4415583980","doi":"https://doi.org/10.1145/3755881.3755901","title":"CodeCleaner: Mitigating Data Contamination for LLM Benchmarking","display_name":"CodeCleaner: Mitigating Data Contamination for LLM Benchmarking","publication_year":2025,"publication_date":"2025-06-20","ids":{"openalex":"https://openalex.org/W4415583980","doi":"https://doi.org/10.1145/3755881.3755901"},"language":"en","primary_location":{"id":"doi:10.1145/3755881.3755901","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3755881.3755901","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 16th International Conference on Internetware","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5053372458","display_name":"Jialun Cao","orcid":"https://orcid.org/0000-0003-4892-6294"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Jialun Cao","raw_affiliation_strings":["Hong Kong University of Science and Technology, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology, Hong Kong, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077331062","display_name":"Songqiang Chen","orcid":"https://orcid.org/0000-0002-1220-8728"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Songqiang Chen","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077307201","display_name":"Wuqi Zhang","orcid":"https://orcid.org/0000-0001-8039-0528"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Wuqi Zhang","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114730187","display_name":"Hau Ching Lo","orcid":"https://orcid.org/0009-0000-4919-6292"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Hau Ching Lo","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037974123","display_name":"Yeting Li","orcid":"https://orcid.org/0000-0003-0991-4231"},"institutions":[{"id":"https://openalex.org/I4210137199","display_name":"Aerospace Information Research Institute","ror":"https://ror.org/0419fj215","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210137199"]},{"id":"https://openalex.org/I4210156404","display_name":"Institute of Information Engineering","ror":"https://ror.org/04r53se39","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210156404"]},{"id":"https://openalex.org/I4210159876","display_name":"Institute of Physics","ror":"https://ror.org/05cvf7v30","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210159876"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yeting Li","raw_affiliation_strings":["Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210156404","https://openalex.org/I4210137199","https://openalex.org/I4210159876"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5034057959","display_name":"Shing-Chi Cheung","orcid":"https://orcid.org/0000-0002-3508-7172"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Shing-Chi Cheung","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong, China","institution_ids":["https://openalex.org/I200769079"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5053372458"],"corresponding_institution_ids":["https://openalex.org/I200769079"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38543652,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"71","last_page":"83"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9828000068664551,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9710999727249146,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/code-refactoring","display_name":"Code refactoring","score":0.8151999711990356},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.4878999888896942},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.44209998846054077},{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.4242999851703644},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.41690000891685486},{"id":"https://openalex.org/keywords/credibility","display_name":"Credibility","score":0.41620001196861267},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.38429999351501465},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3587999939918518},{"id":"https://openalex.org/keywords/unavailability","display_name":"Unavailability","score":0.3361999988555908}],"concepts":[{"id":"https://openalex.org/C152752567","wikidata":"https://www.wikidata.org/wiki/Q116877","display_name":"Code refactoring","level":3,"score":0.8151999711990356},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.628600001335144},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.49300000071525574},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.4878999888896942},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.44209998846054077},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.4242999851703644},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.41690000891685486},{"id":"https://openalex.org/C2780224610","wikidata":"https://www.wikidata.org/wiki/Q1530061","display_name":"Credibility","level":2,"score":0.41620001196861267},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3587999939918518},{"id":"https://openalex.org/C2780505938","wikidata":"https://www.wikidata.org/wiki/Q17093282","display_name":"Unavailability","level":2,"score":0.3361999988555908},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.33059999346733093},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C104054115","wikidata":"https://www.wikidata.org/wiki/Q216828","display_name":"Cohesion (chemistry)","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.2992999851703644},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.2865000069141388},{"id":"https://openalex.org/C2780009758","wikidata":"https://www.wikidata.org/wiki/Q6804172","display_name":"Measure (data warehouse)","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.28029999136924744},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C199519371","wikidata":"https://www.wikidata.org/wiki/Q942695","display_name":"Source lines of code","level":3,"score":0.26600000262260437},{"id":"https://openalex.org/C192465680","wikidata":"https://www.wikidata.org/wiki/Q1413450","display_name":"Precondition","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.25270000100135803},{"id":"https://openalex.org/C97256817","wikidata":"https://www.wikidata.org/wiki/Q1462316","display_name":"Spurious relationship","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3755881.3755901","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3755881.3755901","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 16th International Conference on Internetware","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-167572","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-167572","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2058373514","https://openalex.org/W2143861926","https://openalex.org/W2795435272","https://openalex.org/W2950368691","https://openalex.org/W3090988182","https://openalex.org/W3109744225","https://openalex.org/W3172642864","https://openalex.org/W3194983542","https://openalex.org/W3213241618","https://openalex.org/W4210497109","https://openalex.org/W4249744258","https://openalex.org/W4298289021","https://openalex.org/W4378591002","https://openalex.org/W4378942602","https://openalex.org/W4384345708","https://openalex.org/W4385565597","https://openalex.org/W4385965989","https://openalex.org/W4389518953","https://openalex.org/W4390442456","https://openalex.org/W4391579642","https://openalex.org/W4392402185","https://openalex.org/W4393407110","https://openalex.org/W4400190921","https://openalex.org/W4402670101","https://openalex.org/W4403536427","https://openalex.org/W4403536900","https://openalex.org/W4404780897","https://openalex.org/W4411272397"],"related_works":[],"abstract_inverted_index":{"Data":[0],"contamination":[1,143],"presents":[2,88],"a":[3,57],"critical":[4],"barrier":[5],"preventing":[6],"widespread":[7,79],"industrial":[8,80],"adoption":[9],"of":[10,42,67,97,168,205],"advanced":[11],"software":[12],"engineering":[13],"techniques":[14,76],"that":[15],"leverage":[16],"large":[17],"language":[18],"models":[19],"(LLMs).":[20],"This":[21],"phenomenon":[22],"occurs":[23],"when":[24,174],"evaluation":[25],"data":[26,62,142,185,229],"inadvertently":[27],"overlaps":[28],"with":[29],"the":[30,40,65,84,89,95,132,160],"public":[31],"code":[32,49,69,98],"repositories":[33],"used":[34],"to":[35,60,93,140,158,192,197,222],"train":[36],"LLMs,":[37],"severely":[38],"undermining":[39],"credibility":[41],"performance":[43,210],"evaluations.":[44],"Code":[45],"refactoring,":[46],"which":[47],"comprises":[48],"restructuring":[50],"and":[51,72,106,109,125,144,153],"variable":[52],"renaming,":[53],"has":[54,77],"emerged":[55],"as":[56],"promising":[58],"measure":[59],"mitigate":[61],"contamination.":[63,186,230],"However,":[64],"lack":[66],"automated":[68],"refactoring":[70,75,99],"tools":[71],"scientifically":[73],"validated":[74],"hampered":[78],"implementation.":[81],"To":[82],"bridge":[83],"gap,":[85],"this":[86],"paper":[87],"first":[90],"systematic":[91],"study":[92],"examine":[94],"efficacy":[96,161],"operators":[100,119,137,163,177,191],"at":[101,220],"multiple":[102,123],"scales":[103,124],"(method-level,":[104],"class-level,":[105],"cross-class":[107],"level)":[108],"in":[110,122,178,183,208],"different":[111],"programming":[112],"languages.":[113],"We":[114,129,200,215],"develop":[115],"CodeCleaner,":[116,179],"including":[117],"11":[118],"for":[120,127,134],"Python":[121],"4":[126],"Java.":[128],"elaborate":[130],"on":[131,226],"rationale":[133],"why":[135],"these":[136],"could":[138],"work":[139],"resolve":[141],"use":[145],"both":[146],"data-wise":[147],"(e.g.,":[148,156],"N-gram":[149],"matching":[150],"overlap":[151,170],"ratio)":[152],"model-wise":[154],"metrics":[155],"perplexity)":[157],"quantify":[159],"after":[162,211],"are":[164],"applied.":[165],"A":[166],"drop":[167],"75%":[169],"ratio":[171],"is":[172],"found":[173],"applying":[175,212],"all":[176],"demonstrating":[180],"their":[181,195],"effectiveness":[182],"addressing":[184],"Besides,":[187],"we":[188],"migrate":[189],"four":[190],"Java,":[193],"showing":[194],"generalizability":[196],"another":[198],"language.":[199],"also":[201],"observed":[202],"an":[203],"average":[204],"19%":[206],"decrease":[207],"LLMs\u2019":[209],"our":[213],"operators.":[214],"make":[216],"CodeCleaner":[217],"online":[218],"available":[219],"https://github.com/ArabelaTso/CodeCleaner-v1":[221],"facilitate":[223],"further":[224],"studies":[225],"mitigating":[227],"LLM":[228]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-28T00:00:00"}
