{"id":"https://openalex.org/W4411449848","doi":"https://doi.org/10.1145/3715765","title":"Has My Code Been Stolen for Model Training? A Naturalness Based Approach to Code Contamination Detection","display_name":"Has My Code Been Stolen for Model Training? A Naturalness Based Approach to Code Contamination Detection","publication_year":2025,"publication_date":"2025-06-19","ids":{"openalex":"https://openalex.org/W4411449848","doi":"https://doi.org/10.1145/3715765"},"language":"en","primary_location":{"id":"doi:10.1145/3715765","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3715765","pdf_url":null,"source":{"id":"https://openalex.org/S4404663975","display_name":"Proceedings of the ACM on software engineering.","issn_l":"2994-970X","issn":["2994-970X"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Software Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1145/3715765","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5065346491","display_name":"Haris Ali Khan","orcid":"https://orcid.org/0009-0003-5423-2827"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haris Ali Khan","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077424533","display_name":"Yanjie Jiang","orcid":"https://orcid.org/0000-0001-6404-9143"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]},{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanjie Jiang","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China","Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]},{"raw_affiliation_string":"Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076497507","display_name":"Qasim Umer","orcid":"https://orcid.org/0000-0002-0237-3025"},"institutions":[{"id":"https://openalex.org/I134085113","display_name":"King Fahd University of Petroleum and Minerals","ror":"https://ror.org/03yez3163","country_code":"SA","type":"education","lineage":["https://openalex.org/I134085113"]}],"countries":["SA"],"is_corresponding":false,"raw_author_name":"Qasim Umer","raw_affiliation_strings":["King Fahd University of Petroleum and Minerals, Dhahran, Saudi Arabia"],"affiliations":[{"raw_affiliation_string":"King Fahd University of Petroleum and Minerals, Dhahran, Saudi Arabia","institution_ids":["https://openalex.org/I134085113"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101439496","display_name":"Yuxia Zhang","orcid":"https://orcid.org/0000-0002-9371-5931"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxia Zhang","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102178933","display_name":"Waseem Akram","orcid":null},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Waseem Akram","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024721944","display_name":"Hui Liu","orcid":"https://orcid.org/0000-0002-3267-6801"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hui Liu","raw_affiliation_strings":["Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5065346491"],"corresponding_institution_ids":["https://openalex.org/I125839683"],"apc_list":null,"apc_paid":null,"fwci":3.523,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.92737002,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"2","issue":"FSE","first_page":"1046","last_page":"1067"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9646999835968018,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.9156690835952759},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7371830940246582},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.6601772904396057},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5884284377098083},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.43760475516319275},{"id":"https://openalex.org/keywords/code-review","display_name":"Code review","score":0.42002055048942566},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3935723304748535},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.3603360652923584},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3294519782066345},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3239448368549347},{"id":"https://openalex.org/keywords/software-quality","display_name":"Software quality","score":0.27558737993240356},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.26411178708076477},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.2482450306415558},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.14809074997901917},{"id":"https://openalex.org/keywords/software-development","display_name":"Software development","score":0.09829959273338318}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.9156690835952759},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7371830940246582},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.6601772904396057},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5884284377098083},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.43760475516319275},{"id":"https://openalex.org/C150292731","wikidata":"https://www.wikidata.org/wiki/Q1342704","display_name":"Code review","level":5,"score":0.42002055048942566},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3935723304748535},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3603360652923584},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3294519782066345},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3239448368549347},{"id":"https://openalex.org/C117447612","wikidata":"https://www.wikidata.org/wiki/Q1412670","display_name":"Software quality","level":4,"score":0.27558737993240356},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.26411178708076477},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2482450306415558},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.14809074997901917},{"id":"https://openalex.org/C529173508","wikidata":"https://www.wikidata.org/wiki/Q638608","display_name":"Software development","level":3,"score":0.09829959273338318},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3715765","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3715765","pdf_url":null,"source":{"id":"https://openalex.org/S4404663975","display_name":"Proceedings of the ACM on software engineering.","issn_l":"2994-970X","issn":["2994-970X"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Software Engineering","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3715765","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3715765","pdf_url":null,"source":{"id":"https://openalex.org/S4404663975","display_name":"Proceedings of the ACM on software engineering.","issn_l":"2994-970X","issn":["2994-970X"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Software Engineering","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1543330234","https://openalex.org/W1771830246","https://openalex.org/W1977678951","https://openalex.org/W2140609933","https://openalex.org/W2165747537","https://openalex.org/W2344444819","https://openalex.org/W2795435272","https://openalex.org/W2913107170","https://openalex.org/W2921974495","https://openalex.org/W2954274464","https://openalex.org/W2979792666","https://openalex.org/W2997591727","https://openalex.org/W2998220322","https://openalex.org/W3154151289","https://openalex.org/W4235670907","https://openalex.org/W4237993802","https://openalex.org/W4252386722","https://openalex.org/W4281763794","https://openalex.org/W4283214976","https://openalex.org/W4285306484","https://openalex.org/W4285602147","https://openalex.org/W4324296965","https://openalex.org/W4376121550","https://openalex.org/W4377111800","https://openalex.org/W4385565597","https://openalex.org/W4387298393","https://openalex.org/W4387985331","https://openalex.org/W4387994704","https://openalex.org/W4388651113","https://openalex.org/W4388675518","https://openalex.org/W4391591395","https://openalex.org/W4399631315","https://openalex.org/W4403579163"],"related_works":["https://openalex.org/W4308641625","https://openalex.org/W2904997879","https://openalex.org/W2344367508","https://openalex.org/W3081644756","https://openalex.org/W1997548934","https://openalex.org/W2097696338","https://openalex.org/W2969773072","https://openalex.org/W2909969119","https://openalex.org/W4389600167","https://openalex.org/W4285169523"],"abstract_inverted_index":{"It":[0,181,297],"is":[1,119],"often":[2,77,134],"valuable":[3],"to":[4,18,120,140,156,168,186,192,305],"know":[5],"whether":[6],"a":[7,20,147,174,184,284],"given":[8,21,112,123,175,205],"piece":[9],"of":[10,39,83,110,173,199],"source":[11,159,176,189,206,307],"code":[12,52,160,166,177,190,194,308,312],"has":[13],"or":[14,50],"hasn\u2019t":[15],"been":[16,65],"used":[17],"train":[19],"deep":[22],"learning":[23],"model.":[24],"On":[25],"one":[26],"side,":[27],"it":[28,43,118,266],"helps":[29],"avoid":[30],"data":[31,72,98,113,128,132],"contamination":[32,73,233],"problems":[33],"that":[34,88,226,239,300],"may":[35,298],"exaggerate":[36],"the":[37,68,81,84,89,108,111,122,171,197,200,204,243,251,271,275,291],"performance":[38,198],"evaluated":[40],"models.":[41,180],"Conversely,":[42],"facilitates":[44],"copyright":[45],"protection":[46],"by":[47,254,277,294],"identifying":[48],"private":[49],"protected":[51],"leveraged":[53],"for":[54,67,153,178,231],"model":[55],"training":[56],"without":[57],"permission.":[58],"To":[59],"this":[60,142,144],"end,":[61],"automated":[62],"approaches":[63,76,104,245,293],"have":[64],"proposed":[66],"detection,":[69],"known":[70],"as":[71],"detection.":[74],"Such":[75],"heavily":[78],"rely":[79],"on":[80,203,283],"confidence":[82],"involved":[85],"models,":[86],"assuming":[87],"models":[90,155,202,216,225],"should":[91],"be":[92,303],"more":[93,268],"confident":[94],"in":[95,246],"handling":[96],"contaminated":[97,127,158,188,248],"than":[99,270],"cleaned":[100,131,162],"data.":[101,234],"However,":[102],"such":[103],"do":[105],"not":[106],"consider":[107],"nature":[109],"item,":[114],"i.e.,":[115],"how":[116],"difficult":[117],"predict":[121],"item.":[124],"Consequently,":[125],"difficult-to-predict":[126],"and":[129,196,219,222,265],"easy-to-predict":[130],"are":[133],"misclassified.":[135],"As":[136],"an":[137],"initial":[138],"attempt":[139],"solve":[141],"problem,":[143],"paper":[145],"presents":[146],"naturalness-based":[148],"approach,":[149],"called":[150],"Natural-DaCoDe":[151,164,210,240,259,280,301],",":[152],"code-completion":[154,179,201,224],"distinguish":[157,187],"from":[161,229],"ones.":[163],"leverages":[165],"naturalness":[167,195],"quantitatively":[169],"measure":[170],"difficulty":[172],"then":[182],"trains":[183],"classifier":[185],"according":[191],"both":[193],"code.":[207],"We":[208,256],"evaluate":[209,258],"with":[211,260],"two":[212,223],"pre-trained":[213],"large":[214],"language":[215,286],"(e.g.,":[217],"ChatGPT":[218],"Claude":[220],")":[221],"we":[227],"trained":[228],"scratch":[230],"detecting":[232,247],"Our":[235],"evaluation":[236],"results":[237],"suggest":[238,299],"substantially":[241],"outperformed":[242],"state-of-the-art":[244,272,292],"data,":[249],"improving":[250,274],"average":[252],"accuracy":[253,276],"61.78%.":[255],"also":[257],"method":[261],"name":[262],"suggestion":[263],"task,":[264],"remains":[267],"accurate":[269],"approaches,":[273],"54.39%.":[278],"Furthermore,":[279],"was":[281],"tested":[282],"natural":[285],"text":[287],"benchmark,":[288],"significantly":[289],"outperforming":[290],"22%":[295],".":[296],"could":[302],"applied":[304],"various":[306],"related":[309],"tasks":[310],"besides":[311],"complete.":[313]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
