{"id":"https://openalex.org/W4411271534","doi":"https://doi.org/10.1109/msr66628.2025.00080","title":"How Much Do Code Language Models Remember? An Investigation on Data Extraction Attacks Before and After Fine-tuning","display_name":"How Much Do Code Language Models Remember? An Investigation on Data Extraction Attacks Before and After Fine-tuning","publication_year":2025,"publication_date":"2025-04-28","ids":{"openalex":"https://openalex.org/W4411271534","doi":"https://doi.org/10.1109/msr66628.2025.00080"},"language":"en","primary_location":{"id":"doi:10.1109/msr66628.2025.00080","is_oa":false,"landing_page_url":"https://doi.org/10.1109/msr66628.2025.00080","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM 22nd International Conference on Mining Software Repositories (MSR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5116082890","display_name":"Fabio Salerno","orcid":null},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Fabio Salerno","raw_affiliation_strings":["Delft University of Technology,Delft,The Netherlands"],"affiliations":[{"raw_affiliation_string":"Delft University of Technology,Delft,The Netherlands","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055120592","display_name":"Ali Al-Kaswan","orcid":"https://orcid.org/0000-0001-7338-2044"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Ali Al-Kaswan","raw_affiliation_strings":["Delft University of Technology,Delft,The Netherlands"],"affiliations":[{"raw_affiliation_string":"Delft University of Technology,Delft,The Netherlands","institution_ids":["https://openalex.org/I98358874"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064355563","display_name":"Maliheh Izadi","orcid":"https://orcid.org/0000-0001-5093-5523"},"institutions":[{"id":"https://openalex.org/I98358874","display_name":"Delft University of Technology","ror":"https://ror.org/02e2c7k09","country_code":"NL","type":"education","lineage":["https://openalex.org/I98358874"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Maliheh Izadi","raw_affiliation_strings":["Delft University of Technology,Delft,The Netherlands"],"affiliations":[{"raw_affiliation_string":"Delft University of Technology,Delft,The Netherlands","institution_ids":["https://openalex.org/I98358874"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5116082890"],"corresponding_institution_ids":["https://openalex.org/I98358874"],"apc_list":null,"apc_paid":null,"fwci":8.8474,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.97609166,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"465","last_page":"477"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7753454446792603},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.55866539478302},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.48818665742874146},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4741078019142151},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.4695616066455841},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3422900140285492}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7753454446792603},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.55866539478302},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.48818665742874146},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4741078019142151},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.4695616066455841},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3422900140285492},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/msr66628.2025.00080","is_oa":false,"landing_page_url":"https://doi.org/10.1109/msr66628.2025.00080","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM 22nd International Conference on Mining Software Repositories (MSR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":64,"referenced_works":["https://openalex.org/W2154652894","https://openalex.org/W2535690855","https://openalex.org/W2600463316","https://openalex.org/W2915824643","https://openalex.org/W2963341956","https://openalex.org/W2990138404","https://openalex.org/W3138815606","https://openalex.org/W3177765786","https://openalex.org/W4205807230","https://openalex.org/W4221159672","https://openalex.org/W4282045446","https://openalex.org/W4283364113","https://openalex.org/W4288057780","https://openalex.org/W4292779060","https://openalex.org/W4313547549","https://openalex.org/W4319988693","https://openalex.org/W4321175641","https://openalex.org/W4362706522","https://openalex.org/W4365794116","https://openalex.org/W4366974303","https://openalex.org/W4376606797","https://openalex.org/W4384026730","https://openalex.org/W4385270018","https://openalex.org/W4385474090","https://openalex.org/W4385573569","https://openalex.org/W4385734176","https://openalex.org/W4386081793","https://openalex.org/W4388092435","https://openalex.org/W4389009541","https://openalex.org/W4391107696","https://openalex.org/W4394744221","https://openalex.org/W4394744510","https://openalex.org/W4394745382","https://openalex.org/W4399577216","https://openalex.org/W4399795404","https://openalex.org/W4402665833","https://openalex.org/W4402671911","https://openalex.org/W4411271534","https://openalex.org/W6678262379","https://openalex.org/W6682631176","https://openalex.org/W6750200984","https://openalex.org/W6751104502","https://openalex.org/W6778883912","https://openalex.org/W6787335730","https://openalex.org/W6790588633","https://openalex.org/W6800949977","https://openalex.org/W6809994879","https://openalex.org/W6810332117","https://openalex.org/W6810463509","https://openalex.org/W6838795662","https://openalex.org/W6838986620","https://openalex.org/W6838995493","https://openalex.org/W6839352123","https://openalex.org/W6850364365","https://openalex.org/W6851946153","https://openalex.org/W6852109389","https://openalex.org/W6852887568","https://openalex.org/W6854805212","https://openalex.org/W6856051742","https://openalex.org/W6858040456","https://openalex.org/W6870378194","https://openalex.org/W6876571786","https://openalex.org/W6876954411","https://openalex.org/W6967084484"],"related_works":["https://openalex.org/W2377297411","https://openalex.org/W3148217948","https://openalex.org/W2375788636","https://openalex.org/W2358561207","https://openalex.org/W2975617233","https://openalex.org/W2388704129","https://openalex.org/W2392827053","https://openalex.org/W4231937131","https://openalex.org/W3188962172","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Code":[0],"language":[1,102],"models,":[2,46,170,232],"while":[3,233],"widely":[4],"popular,":[5],"are":[6,219],"often":[7],"trained":[8],"on":[9,76,181],"unsanitized":[10],"source":[11],"code":[12,101],"gathered":[13],"from":[14,143,228],"across":[15],"the":[16,26,41,52,106,119,161,185,220,234,237],"Internet.":[17],"Previous":[18],"work":[19],"revealed":[20],"that":[21,133,158,213],"pre-trained":[22,98,229],"models":[23,103,173],"can":[24,192],"remember":[25],"content":[27],"of":[28,44,108,121,136,163,188],"their":[29,175],"training":[30],"data":[31,36,82,109,139,178,214,223],"and":[32,63,70,99,124,207,216,230],"regurgitate":[33],"them":[34],"through":[35],"extraction":[37,128,179],"attacks.":[38,129],"Due":[39],"to":[40,104,117,127,149,168,177,194,225,240],"large":[42,71],"size":[43],"current":[45],"only":[47],"a":[48,114],"few":[49],"entities":[50,72],"have":[51],"resources":[53,62],"for":[54,73,83],"pre-training":[55,123,138,164],"such":[56],"models.":[57],"However,":[58,166],"fine-tuning":[59,125,171,182,189],"requires":[60],"fewer":[61],"is":[64,236],"increasingly":[65],"used":[66],"by":[67],"both":[68,97,122],"small":[69,80],"its":[74],"effectiveness":[75],"specialized":[77],"data.":[78,165,183],"Such":[79],"curated":[81],"finetuning":[84,159],"might":[85],"contain":[86],"sensitive":[87],"information":[88,218],"or":[89],"proprietary":[90],"assets.":[91],"In":[92],"this":[93,146,191],"study,":[94],"we":[95,199],"attack":[96],"fine-tuned":[100],"investigate":[105],"extent":[107],"extractability.":[110],"We":[111,210],"first":[112],"develop":[113],"custom":[115],"benchmark":[116],"assess":[118],"vulnerability":[120,176],"samples":[126,205],"Our":[130],"findings":[131],"reveal":[132],"$54.9":[134],"\\%$":[135],"extractable":[137,204],"could":[140],"be":[141,226,241],"retrieved":[142],"StarCoder2-15B,":[144],"whereas":[145],"number":[147],"decreased":[148],"$\\mathbf{2":[150],"3.":[151],"5":[152],"\\%}$":[153],"after":[154,208,243],"fine-tuning.":[155,209,244],"This":[156],"indicates":[157],"reduces":[160],"extractability":[162],"compared":[167],"larger":[169],"smaller":[172],"increases":[174],"attacks":[180],"Given":[184],"potential":[186],"sensitivity":[187],"data,":[190],"lead":[193],"more":[195],"severe":[196],"consequences.":[197],"Lastly,":[198],"also":[200,211],"manually":[201],"analyzed":[202],"2000":[203],"before":[206],"found":[212],"carriers":[215],"licensing":[217],"most":[221,238],"likely":[222,239],"categories":[224],"memorized":[227],"finetuned":[231],"latter":[235],"forgotten":[242]},"counts_by_year":[{"year":2025,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
