{"id":"https://openalex.org/W4313563767","doi":"https://doi.org/10.1145/3551349.3556918","title":"Data Leakage in Notebooks: Static Detection and Better Processes","display_name":"Data Leakage in Notebooks: Static Detection and Better Processes","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4313563767","doi":"https://doi.org/10.1145/3551349.3556918"},"language":"en","primary_location":{"id":"doi:10.1145/3551349.3556918","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3551349.3556918","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3551349.3556918","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3551349.3556918","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101966381","display_name":"Chenyang Yang","orcid":"https://orcid.org/0000-0001-5016-7296"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Chenyang Yang","raw_affiliation_strings":["Carnegie Mellon University, USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112865849","display_name":"Rachel A Brower-Sinning","orcid":null},"institutions":[{"id":"https://openalex.org/I114772536","display_name":"Software Engineering Institute","ror":"https://ror.org/01xqjjn94","country_code":"US","type":"facility","lineage":["https://openalex.org/I114772536","https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rachel A Brower-Sinning","raw_affiliation_strings":["Carnegie Mellon Software Engineering Institute, USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon Software Engineering Institute, USA","institution_ids":["https://openalex.org/I114772536"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015259511","display_name":"Grace A. Lewis","orcid":"https://orcid.org/0000-0001-9128-9863"},"institutions":[{"id":"https://openalex.org/I114772536","display_name":"Software Engineering Institute","ror":"https://ror.org/01xqjjn94","country_code":"US","type":"facility","lineage":["https://openalex.org/I114772536","https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Grace Lewis","raw_affiliation_strings":["Carnegie Mellon Software Engineering Institute, United States of America"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon Software Engineering Institute, United States of America","institution_ids":["https://openalex.org/I114772536"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5067467896","display_name":"Christian K\u00e4stner","orcid":"https://orcid.org/0000-0002-4450-4572"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Christian K\u00c4Stner","raw_affiliation_strings":["Carnegie Mellon University, United States of America"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, United States of America","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101966381"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":3.0536,"has_fulltext":true,"cited_by_count":23,"citation_normalized_percentile":{"value":0.9257015,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"12"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/leakage","display_name":"Leakage (economics)","score":0.7615174055099487},{"id":"https://openalex.org/keywords/mistake","display_name":"Mistake","score":0.7369659543037415},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6889464259147644},{"id":"https://openalex.org/keywords/pipeline-transport","display_name":"Pipeline transport","score":0.5001299381256104},{"id":"https://openalex.org/keywords/static-analysis","display_name":"Static analysis","score":0.46942928433418274},{"id":"https://openalex.org/keywords/reliability-engineering","display_name":"Reliability engineering","score":0.4559004008769989},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.39573606848716736},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.3449675440788269},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.19337308406829834},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09448304772377014}],"concepts":[{"id":"https://openalex.org/C2777042071","wikidata":"https://www.wikidata.org/wiki/Q6509304","display_name":"Leakage (economics)","level":2,"score":0.7615174055099487},{"id":"https://openalex.org/C2777179996","wikidata":"https://www.wikidata.org/wiki/Q911222","display_name":"Mistake","level":2,"score":0.7369659543037415},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6889464259147644},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.5001299381256104},{"id":"https://openalex.org/C97686452","wikidata":"https://www.wikidata.org/wiki/Q7604153","display_name":"Static analysis","level":2,"score":0.46942928433418274},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.4559004008769989},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.39573606848716736},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.3449675440788269},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.19337308406829834},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09448304772377014},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C87717796","wikidata":"https://www.wikidata.org/wiki/Q146326","display_name":"Environmental engineering","level":1,"score":0.0},{"id":"https://openalex.org/C139719470","wikidata":"https://www.wikidata.org/wiki/Q39680","display_name":"Macroeconomics","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3551349.3556918","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3551349.3556918","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3551349.3556918","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3551349.3556918","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3551349.3556918","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3551349.3556918","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 37th IEEE/ACM International Conference on Automated Software Engineering","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2144278397","display_name":null,"funder_award_id":"FA8702-15-D-0002","funder_id":"https://openalex.org/F4320306078","funder_display_name":"U.S. Department of Defense"},{"id":"https://openalex.org/G3315187950","display_name":null,"funder_award_id":"Contract No. FA8702-15-D-0002","funder_id":"https://openalex.org/F4320306078","funder_display_name":"U.S. Department of Defense"},{"id":"https://openalex.org/G3975186846","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320310207","funder_display_name":"Carnegie Mellon University"},{"id":"https://openalex.org/G4011700787","display_name":"SHF: SMALL: Streamlining Fork-Based Software Development","funder_award_id":"1813598","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G526907975","display_name":null,"funder_award_id":"FA8702-15-D-0002","funder_id":"https://openalex.org/F4320310207","funder_display_name":"Carnegie Mellon University"},{"id":"https://openalex.org/G6861468170","display_name":null,"funder_award_id":"1813598 and 2131477","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8246969693","display_name":null,"funder_award_id":"2131477","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306078","display_name":"U.S. Department of Defense","ror":"https://ror.org/0447fe631"},{"id":"https://openalex.org/F4320310207","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4313563767.pdf","grobid_xml":"https://content.openalex.org/works/W4313563767.grobid-xml"},"referenced_works_count":28,"referenced_works":["https://openalex.org/W1555759181","https://openalex.org/W2030433745","https://openalex.org/W2087228724","https://openalex.org/W2595557058","https://openalex.org/W2736287575","https://openalex.org/W2752857821","https://openalex.org/W2796040126","https://openalex.org/W2922234936","https://openalex.org/W2941232686","https://openalex.org/W2951243568","https://openalex.org/W2954370766","https://openalex.org/W2966284335","https://openalex.org/W3012094108","https://openalex.org/W3016970897","https://openalex.org/W3043652335","https://openalex.org/W3089843629","https://openalex.org/W3103934428","https://openalex.org/W3110358918","https://openalex.org/W3125702975","https://openalex.org/W3155808134","https://openalex.org/W3184231327","https://openalex.org/W4206770190","https://openalex.org/W4210294742","https://openalex.org/W4221145303","https://openalex.org/W4247889999","https://openalex.org/W4254666025","https://openalex.org/W4284670548","https://openalex.org/W4365786623"],"related_works":["https://openalex.org/W1590719878","https://openalex.org/W4244271513","https://openalex.org/W2365974527","https://openalex.org/W4306382224","https://openalex.org/W4226517682","https://openalex.org/W3108263396","https://openalex.org/W2895872277","https://openalex.org/W1561425952","https://openalex.org/W4284675145","https://openalex.org/W2372021191"],"abstract_inverted_index":{"Data":[0],"science":[1,80],"pipelines":[2],"to":[3,27,37,62,71],"train":[4],"and":[5,22,60,92,115,117],"evaluate":[6],"models":[7,41],"with":[8],"machine":[9],"learning":[10],"may":[11,57],"contain":[12],"bugs":[13],"just":[14],"like":[15],"any":[16],"other":[17],"code.":[18,81],"Leakage":[19],"between":[20],"training":[21],"test":[23],"data":[24,76,79,90],"can":[25,46,111,121],"lead":[26],"overestimating":[28],"the":[29,125],"model\u2019s":[30],"accuracy":[31],"during":[32],"offline":[33],"evaluations,":[34],"possibly":[35],"leading":[36],"deployment":[38],"of":[39,75],"low-quality":[40],"in":[42,78],"production.":[43],"Such":[44],"leakage":[45,77,91,95,119],"happen":[47],"easily":[48],"by":[49,52],"mistake":[50],"or":[51],"following":[53],"poor":[54],"practices,":[55],"but":[56],"be":[58,122],"tedious":[59],"challenging":[61],"detect":[63,72],"manually.":[64],"We":[65,104],"develop":[66],"a":[67],"static":[68,108],"analysis":[69,87,109],"approach":[70,110],"common":[73],"forms":[74],"Our":[82],"evaluation":[83],"shows":[84],"that":[85,93],"our":[86,107],"accurately":[88],"detects":[89],"such":[94],"is":[96],"pervasive":[97],"among":[98],"over":[99],"100,000":[100],"analyzed":[101],"public":[102],"notebooks.":[103],"discuss":[105],"how":[106,118],"help":[112],"both":[113],"practitioners":[114],"educators,":[116],"prevention":[120],"designed":[123],"into":[124],"development":[126],"process.":[127]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":5}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
