{"id":"https://openalex.org/W4389158415","doi":"https://doi.org/10.1145/3611643.3613090","title":"A Language Model of Java Methods with Train/Test Deduplication","display_name":"A Language Model of Java Methods with Train/Test Deduplication","publication_year":2023,"publication_date":"2023-11-30","ids":{"openalex":"https://openalex.org/W4389158415","doi":"https://doi.org/10.1145/3611643.3613090"},"language":"en","primary_location":{"id":"doi:10.1145/3611643.3613090","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3611643.3613090","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111071005","display_name":"Chia\u2010Yi Su","orcid":"https://orcid.org/0000-0003-1803-560X"},"institutions":[{"id":"https://openalex.org/I107639228","display_name":"University of Notre Dame","ror":"https://ror.org/00mkhxb43","country_code":"US","type":"education","lineage":["https://openalex.org/I107639228"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Chia-Yi Su","raw_affiliation_strings":["University of Notre Dame, Notre Dame, USA"],"raw_orcid":"https://orcid.org/0000-0003-1803-560X","affiliations":[{"raw_affiliation_string":"University of Notre Dame, Notre Dame, USA","institution_ids":["https://openalex.org/I107639228"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006683167","display_name":"Aakash Bansal","orcid":"https://orcid.org/0000-0001-7475-7899"},"institutions":[{"id":"https://openalex.org/I107639228","display_name":"University of Notre Dame","ror":"https://ror.org/00mkhxb43","country_code":"US","type":"education","lineage":["https://openalex.org/I107639228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aakash Bansal","raw_affiliation_strings":["University of Notre Dame, Notre Dame, USA"],"raw_orcid":"https://orcid.org/0000-0001-7475-7899","affiliations":[{"raw_affiliation_string":"University of Notre Dame, Notre Dame, USA","institution_ids":["https://openalex.org/I107639228"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045149772","display_name":"Vijayanta Jain","orcid":"https://orcid.org/0000-0003-2652-5107"},"institutions":[{"id":"https://openalex.org/I7947594","display_name":"University of Maine","ror":"https://ror.org/01adr0w49","country_code":"US","type":"education","lineage":["https://openalex.org/I2802397601","https://openalex.org/I7947594"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vijayanta Jain","raw_affiliation_strings":["University of Maine, Orono, USA"],"raw_orcid":"https://orcid.org/0000-0003-2652-5107","affiliations":[{"raw_affiliation_string":"University of Maine, Orono, USA","institution_ids":["https://openalex.org/I7947594"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072117004","display_name":"Sepideh Ghanavati","orcid":"https://orcid.org/0000-0001-7972-667X"},"institutions":[{"id":"https://openalex.org/I7947594","display_name":"University of Maine","ror":"https://ror.org/01adr0w49","country_code":"US","type":"education","lineage":["https://openalex.org/I2802397601","https://openalex.org/I7947594"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sepideh Ghanavati","raw_affiliation_strings":["University of Maine, Orono, USA"],"raw_orcid":"https://orcid.org/0000-0001-7972-667X","affiliations":[{"raw_affiliation_string":"University of Maine, Orono, USA","institution_ids":["https://openalex.org/I7947594"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5084874990","display_name":"Collin McMillan","orcid":"https://orcid.org/0009-0005-0887-1083"},"institutions":[{"id":"https://openalex.org/I107639228","display_name":"University of Notre Dame","ror":"https://ror.org/00mkhxb43","country_code":"US","type":"education","lineage":["https://openalex.org/I107639228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Collin McMillan","raw_affiliation_strings":["University of Notre Dame, Notre Dame, USA"],"raw_orcid":"https://orcid.org/0009-0005-0887-1083","affiliations":[{"raw_affiliation_string":"University of Notre Dame, Notre Dame, USA","institution_ids":["https://openalex.org/I107639228"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5111071005"],"corresponding_institution_ids":["https://openalex.org/I107639228"],"apc_list":null,"apc_paid":null,"fwci":4.8533,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.95551885,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"2152","last_page":"2156"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9200284481048584},{"id":"https://openalex.org/keywords/java","display_name":"Java","score":0.7651480436325073},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.5236139893531799},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.521973729133606},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.44373929500579834},{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.4294130206108093},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.35973307490348816},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.3533565104007721}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9200284481048584},{"id":"https://openalex.org/C548217200","wikidata":"https://www.wikidata.org/wiki/Q251","display_name":"Java","level":2,"score":0.7651480436325073},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.5236139893531799},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.521973729133606},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.44373929500579834},{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.4294130206108093},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.35973307490348816},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3533565104007721}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3611643.3613090","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3611643.3613090","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5799999833106995,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[{"id":"https://openalex.org/G1089092415","display_name":null,"funder_award_id":"2100035","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G1887447798","display_name":null,"funder_award_id":"2211428","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5175445557","display_name":null,"funder_award_id":"CCF-2100035","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7398002918","display_name":null,"funder_award_id":"CCF-2100035,CCF-2211428","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W46679369","https://openalex.org/W1736726159","https://openalex.org/W2003199018","https://openalex.org/W2136296681","https://openalex.org/W2964270303","https://openalex.org/W3040472729","https://openalex.org/W3093783890","https://openalex.org/W4200203799","https://openalex.org/W4225108562","https://openalex.org/W4285225959","https://openalex.org/W4304701465","https://openalex.org/W4313185449","https://openalex.org/W4313547549","https://openalex.org/W4321013654","https://openalex.org/W4378676756","https://openalex.org/W4384026634","https://openalex.org/W4385562549","https://openalex.org/W4385573610","https://openalex.org/W6601894380"],"related_works":["https://openalex.org/W3144870715","https://openalex.org/W3142319788","https://openalex.org/W2587188779","https://openalex.org/W3132870970","https://openalex.org/W4385804830","https://openalex.org/W2943088381","https://openalex.org/W2074021203","https://openalex.org/W2144348063","https://openalex.org/W4296125805","https://openalex.org/W1982579475"],"abstract_inverted_index":{"This":[0],"tool":[1],"demonstration":[2],"presents":[3],"a":[4,8,52,82,89,134],"research":[5,114],"toolkit":[6],"for":[7,43,67,76,151],"language":[9,38],"model":[10,87],"of":[11,26,60,113,118,137],"Java":[12,100,140,149],"source":[13,190],"code.":[14],"The":[15],"target":[16],"audience":[17],"includes":[18,98],"researchers":[19,44,169],"studying":[20],"problems":[21],"at":[22,163],"the":[23,63,119,147,179],"granularity":[24],"level":[25],"subroutines,":[27],"statements,":[28],"or":[29],"variables":[30],"in":[31,178],"Java.":[32],"In":[33],"contrast":[34],"to":[35,81,115,126,167],"many":[36],"existing":[37],"models,":[39],"we":[40,121],"prioritize":[41],"features":[42],"including":[45,146],"an":[46,72],"open":[47,189],"and":[48,71,104,187,191,195],"easily-searchable":[49],"training":[50,64,96,180],"set,":[51,65],"held":[53,138],"out":[54,139],"test":[55,135,174],"set":[56,97,136],"with":[57,92,128],"different":[58],"levels":[59],"deduplication":[61,157],"from":[62],"infrastructure":[66],"deduplicating":[68],"new":[69],"examples,":[70],"implementation":[73],"platform":[74],"suitable":[75],"execution":[77],"on":[78],"equipment":[79],"accessible":[80],"relatively":[83],"modest":[84],"budget.":[85],"Our":[86,95],"is":[88],"GPT2-like":[90],"architecture":[91],"350m":[93],"parameters.":[94],"52m":[99],"methods":[101,141],"(9b":[102],"tokens)":[103],"13m":[105],"StackOverflow":[106],"threads":[107],"(10.5b":[108],"tokens).":[109],"To":[110],"improve":[111],"accessibility":[112],"more":[116],"members":[117],"community,":[120],"limit":[122],"local":[123],"resource":[124],"requirements":[125],"GPUs":[127],"16GB":[129],"video":[130],"memory.":[131],"We":[132,154,182],"provide":[133,156],"that":[142,171],"include":[143],"descriptive":[144],"comments,":[145],"entire":[148],"projects":[150],"those":[152],"methods.":[153],"also":[155],"tools":[158,186],"using":[159],"precomputed":[160],"hash":[161],"tables":[162],"various":[164],"similarity":[165],"thresholds":[166],"help":[168],"ensure":[170],"their":[172],"own":[173],"examples":[175],"are":[176],"not":[177],"set.":[181],"make":[183],"all":[184],"our":[185],"data":[188],"available":[192],"via":[193],"Huggingface":[194],"Github.":[196]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
