{"id":"https://openalex.org/W7153179491","doi":"https://doi.org/10.48550/arxiv.2604.07769","title":"An Empirical Study on Influence-Based Pretraining Data Selection for Code Large Language Models","display_name":"An Empirical Study on Influence-Based Pretraining Data Selection for Code Large Language Models","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153179491","doi":"https://doi.org/10.48550/arxiv.2604.07769"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07769","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07769","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07769","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039008475","display_name":"Chengli Xing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Chengli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010506833","display_name":"Zhengran Zeng","orcid":"https://orcid.org/0009-0009-8422-4522"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Zhengran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021257547","display_name":"Gexiang Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Gexiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133373356","display_name":"Rui Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Rui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133380598","display_name":"Wei Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133341071","display_name":"Shikun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shikun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.42010000348091125,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.42010000348091125,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.09019999951124191,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.08510000258684158,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.5174000263214111},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.48829999566078186},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.429500013589859},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.4253000020980835},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4239000082015991},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.38499999046325684},{"id":"https://openalex.org/keywords/genetic-programming","display_name":"Genetic programming","score":0.35899999737739563},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3492000102996826},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.3476000130176544}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8100000023841858},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6208000183105469},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5235000252723694},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.5174000263214111},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.48829999566078186},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.429500013589859},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.4253000020980835},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4239000082015991},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.38499999046325684},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.38199999928474426},{"id":"https://openalex.org/C110332635","wikidata":"https://www.wikidata.org/wiki/Q629498","display_name":"Genetic programming","level":2,"score":0.35899999737739563},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3492000102996826},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.3476000130176544},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.34619998931884766},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.3368000090122223},{"id":"https://openalex.org/C199519371","wikidata":"https://www.wikidata.org/wiki/Q942695","display_name":"Source lines of code","level":3,"score":0.3359000086784363},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3327000141143799},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C133199616","wikidata":"https://www.wikidata.org/wiki/Q25386885","display_name":"Empirical modelling","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.30790001153945923},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.3050999939441681},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.2955999970436096},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2678999900817871},{"id":"https://openalex.org/C50033165","wikidata":"https://www.wikidata.org/wiki/Q15712089","display_name":"Inductive programming","level":3,"score":0.26260000467300415},{"id":"https://openalex.org/C150292731","wikidata":"https://www.wikidata.org/wiki/Q1342704","display_name":"Code review","level":5,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07769","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07769","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07769","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07769","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,12],"code":[3,140],"large":[4],"language":[5],"models":[6,114,205],"(Code-LLMs)":[7],"have":[8,19],"demonstrated":[9],"remarkable":[10],"capabilities":[11],"resolving":[13],"programming":[14,52,97,183,206,223],"related":[15],"tasks.":[16,224],"Meanwhile,":[17],"researchers":[18],"recognized":[20],"that":[21,197,211],"the":[22,36,65,79,113,153,171,187,212],"quality":[23],"of":[24,35,67,81,104,137,155,173,189,214],"pre-training":[25,40],"data":[26,41,72,176,217],"is":[27],"crucial":[28],"for":[29,51,92,95],"improving":[30],"LLM":[31],"performance.":[32,207],"However,":[33],"most":[34],"existing":[37],"research":[38],"on":[39,45,116,134,143,201],"filtering":[42,73,157,192,199],"has":[43],"focused":[44],"general":[46,71],"datasets,":[47],"and":[48,111,182,185],"little":[49],"attention":[50],"datasets.":[53,83],"In":[54],"this":[55,61,85,164],"paper,":[56],"we":[57,87,124,145,160,209],"aim":[58],"to":[59,151],"address":[60],"gap":[62],"by":[63],"exploring":[64],"effectiveness":[66,154],"a":[68,90,102,120,126,135],"widely":[69],"used":[70],"technique,":[74],"i.e.,":[75],"data-influence-score":[76,94,156,191,198],"filtering,":[77],"within":[78],"context":[80],"programming-related":[82],"To":[84],"end,":[86],"first":[88],"introduce":[89],"method":[91],"calculating":[93],"generative":[96],"tasks":[98,107],"which":[99],"involves":[100],"transforming":[101],"variety":[103],"downstream":[105,222],"coding":[106],"into":[108],"validation":[109],"sets":[110,118],"using":[112],"loss":[115],"these":[117],"as":[119],"performance":[121],"metric.":[122],"Next,":[123],"pre-train":[125],"Code-LLMs":[127],"with":[128],"1":[129],"billion":[130,139],"parameters":[131],"from":[132],"scratch":[133],"dataset":[136],"100":[138],"tokens.":[141],"Based":[142],"it,":[144],"conduct":[146],"an":[147],"extensive":[148],"empirical":[149],"study":[150],"evaluate":[152],"methods.":[158],"Specifically,":[159],"examine":[161],"how":[162,170],"well":[163],"technique":[165],"improves":[166],"model":[167],"performance,":[168],"investigate":[169],"characteristics":[172],"beneficial":[174,215],"training":[175,180,216],"vary":[177],"across":[178,220],"different":[179],"stages":[181],"tasks,":[184],"assess":[186],"feasibility":[188],"prediction-based":[190],"method.":[193],"Our":[194],"findings":[195],"show":[196],"based":[200],"validation-set-loss":[202],"can":[203],"enhance":[204],"Moreover,":[208],"observe":[210],"criteria":[213],"differ":[218],"significantly":[219],"various":[221]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-11T00:00:00"}
