{"id":"https://openalex.org/W7131383301","doi":"https://doi.org/10.48550/arxiv.2602.20799","title":"Unseen-Codebases-Domain Data Synthesis and Training Based on Code Graphs","display_name":"Unseen-Codebases-Domain Data Synthesis and Training Based on Code Graphs","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7131383301","doi":"https://doi.org/10.48550/arxiv.2602.20799"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.20799","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126829456","display_name":"Guangsheng Ou","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ou, Guangsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100639013","display_name":"Qiming Zhang","orcid":"https://orcid.org/0000-0003-0060-0543"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qiming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126853258","display_name":"Sirong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Sirong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111131227","display_name":"Anji Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Anji","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126842426","display_name":"Dong Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Dong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126826586","display_name":"Tiancheng Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Tiancheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053837157","display_name":"Dekun Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Dekun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126799510","display_name":"Cuiyun Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Cuiyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126809010","display_name":"Long Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Long","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126844433","display_name":"Jun Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126823609","display_name":"Mingwei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Mingwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126788971","display_name":"Zibin Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zibin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5126829456"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.40070000290870667,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.40070000290870667,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.10689999908208847,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.09399999678134918,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codebase","display_name":"Codebase","score":0.9319999814033508},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.680899977684021},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.6345999836921692},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5101000070571899},{"id":"https://openalex.org/keywords/dependency-graph","display_name":"Dependency graph","score":0.49160000681877136},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3910999894142151},{"id":"https://openalex.org/keywords/relation","display_name":"Relation (database)","score":0.3901999890804291},{"id":"https://openalex.org/keywords/code-review","display_name":"Code review","score":0.37529999017715454}],"concepts":[{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.9319999814033508},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8895999789237976},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.680899977684021},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.6345999836921692},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C16311509","wikidata":"https://www.wikidata.org/wiki/Q4148050","display_name":"Dependency graph","level":3,"score":0.49160000681877136},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.43290001153945923},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3910999894142151},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.3901999890804291},{"id":"https://openalex.org/C150292731","wikidata":"https://www.wikidata.org/wiki/Q1342704","display_name":"Code review","level":5,"score":0.37529999017715454},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35830000042915344},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.35249999165534973},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34549999237060547},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.33059999346733093},{"id":"https://openalex.org/C2987255567","wikidata":"https://www.wikidata.org/wiki/Q33002955","display_name":"Knowledge graph","level":2,"score":0.3296000063419342},{"id":"https://openalex.org/C152752567","wikidata":"https://www.wikidata.org/wiki/Q116877","display_name":"Code refactoring","level":3,"score":0.3156999945640564},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.2976999878883362},{"id":"https://openalex.org/C16910744","wikidata":"https://www.wikidata.org/wiki/Q7705759","display_name":"Test data","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C137287247","wikidata":"https://www.wikidata.org/wiki/Q1329550","display_name":"Static program analysis","level":4,"score":0.28189998865127563},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2815999984741211},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2728999853134155},{"id":"https://openalex.org/C2776145971","wikidata":"https://www.wikidata.org/wiki/Q30673951","display_name":"Labeled data","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2644999921321869},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2639000117778778}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.20799","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.20799","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20799","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.20799","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"the":[1,58,71,160],"context":[2],"of":[3,20,64,101,187],"newly":[4],"release":[5],"software":[6],"frameworks,":[7],"large":[8,99],"language":[9],"models":[10,54],"(LLMs)":[11],"often":[12],"exhibit":[13],"poor":[14],"performance":[15],"and":[16,74,97,205,223],"a":[17,65,140,150,165,213],"high":[18],"rate":[19],"hallucination,":[21],"as":[22,37,110],"they":[23],"are":[24,118],"not":[25],"exposed":[26],"to":[27,52,55,68,120,129,163],"such":[28,36],"environments":[29],"during":[30],"training.":[31],"Although":[32,76],"inference-time":[33],"augmentation":[34],"techniques":[35],"retrieval-augmented":[38],"generation":[39,219],"(RAG)":[40],"can":[41,80,106],"partially":[42],"mitigate":[43],"hallucinations,":[44],"knowledge":[45,78],"injection":[46,79],"through":[47,83],"prompting":[48],"alone":[49],"is":[50],"insufficient":[51,119],"enable":[53],"fully":[56],"understand":[57],"intrinsic":[59],"relationships":[60],"among":[61],"different":[62],"components":[63],"codebase,":[66],"or":[67],"reason":[69],"about":[70],"correct":[72],"compositions":[73],"apply.":[75],"explicit":[77,192],"be":[81,107],"achieved":[82],"post-training,":[84],"compared":[85],"with":[86,191],"public":[87],"code":[88,96,104,131,151,162,166,218],"domains,":[89],"unseen":[90,123,155,221],"codebases":[91,124,222],"typically":[92],"provide":[93],"only":[94],"source":[95,130,161],"lack":[98],"volumes":[100],"high-quality,":[102],"usage-oriented":[103],"that":[105],"directly":[108],"leveraged":[109],"training":[111,142],"data.":[112,209],"Consequently,":[113],"existing":[114],"data":[115,146,189],"synthesis":[116,147],"approaches":[117],"adequately":[121],"capture":[122],"usage":[125],"scenarios":[126],"when":[127],"restricted":[128],"alone.":[132],"To":[133],"address":[134],"these":[135],"challenges,":[136],"we":[137],"propose":[138],"UCD-Training,":[139],"two-stage":[141],"framework":[143],"for":[144,217],"reasoning-aware":[145],"grounded":[148],"in":[149],"graph":[152],"constructed":[153],"from":[154],"codebases.":[156,229],"UCD-Training":[157],"first":[158],"parses":[159],"build":[164],"graph,":[167],"then":[168],"conducts":[169],"dependency-preserving":[170],"continued":[171],"pretraining":[172],"(CPT)":[173],"using":[174],"file-level":[175],"dependency":[176],"data,":[177,199,204],"followed":[178],"by":[179],"graph-grounded":[180],"supervised":[181],"fine-tuning":[182],"(SFT)":[183],"on":[184,220],"three":[185],"types":[186],"synthesized":[188],"augmented":[190],"reasoning":[193,198,203],"traces:":[194],"(1)":[195],"single-hop":[196],"relation":[197],"(2)":[200],"compositional":[201],"API":[202],"(3)":[206],"codebase":[207],"utilization":[208],"We":[210],"further":[211],"introduce":[212],"new":[214],"benchmark,":[215],"UnseenCodeBench,":[216],"conduct":[224],"comprehensive":[225],"experiments":[226],"across":[227],"multiple":[228]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-26T00:00:00"}
