{"id":"https://openalex.org/W4417447657","doi":"https://doi.org/10.48550/arxiv.2512.12117","title":"Citation-Grounded Code Comprehension: Preventing LLM Hallucination Through Hybrid Retrieval and Graph-Augmented Context","display_name":"Citation-Grounded Code Comprehension: Preventing LLM Hallucination Through Hybrid Retrieval and Graph-Augmented Context","publication_year":2025,"publication_date":"2025-12-13","ids":{"openalex":"https://openalex.org/W4417447657","doi":"https://doi.org/10.48550/arxiv.2512.12117"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2512.12117","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.12117","pdf_url":"https://arxiv.org/pdf/2512.12117","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2512.12117","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070931114","display_name":"Jahidul Arafat","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Arafat, Jahidul","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5070931114"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9463000297546387,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10260","display_name":"Software Engineering Research","score":0.9463000297546387,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.01119999960064888,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.007899999618530273,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/citation","display_name":"Citation","score":0.6219000220298767},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.5134999752044678},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.39879998564720154},{"id":"https://openalex.org/keywords/program-comprehension","display_name":"Program comprehension","score":0.385699987411499},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.3813000023365021},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.3797000050544739},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.36169999837875366},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.35429999232292175}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8463000059127808},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6452999711036682},{"id":"https://openalex.org/C2778805511","wikidata":"https://www.wikidata.org/wiki/Q1713","display_name":"Citation","level":2,"score":0.6219000220298767},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.5134999752044678},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44589999318122864},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.39879998564720154},{"id":"https://openalex.org/C2777561058","wikidata":"https://www.wikidata.org/wiki/Q2652119","display_name":"Program comprehension","level":4,"score":0.385699987411499},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3813000023365021},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.3797000050544739},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.36169999837875366},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35589998960494995},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.35429999232292175},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.31769999861717224},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.314300000667572},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C97250363","wikidata":"https://www.wikidata.org/wiki/Q235557","display_name":"File format","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.2660999894142151},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.2651999890804291},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2623000144958496},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2512.12117","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.12117","pdf_url":"https://arxiv.org/pdf/2512.12117","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2512.12117","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.12117","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2512.12117","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.12117","pdf_url":"https://arxiv.org/pdf/2512.12117","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1,18],"models":[2],"have":[3],"become":[4],"essential":[5],"tools":[6],"for":[7,139,147],"code":[8,50,135,148],"comprehension,":[9],"enabling":[10],"developers":[11],"to":[12,29,36,117,192],"query":[13],"unfamiliar":[14],"codebases":[15],"through":[16,52],"natural":[17],"interfaces.":[19],"However,":[20],"LLM":[21],"hallucination,":[22],"generating":[23],"plausible":[24],"but":[25,120],"factually":[26],"incorrect":[27],"citations":[28],"source":[30],"code,":[31],"remains":[32],"a":[33,167],"critical":[34],"barrier":[35],"reliable":[37],"developer":[38,72],"assistance.":[39],"This":[40],"paper":[41],"addresses":[42],"the":[43,93,114],"challenges":[44,87],"of":[45,88,209],"achieving":[46,156],"verifiable,":[47],"citation":[48,81,89,118,140,159],"grounded":[49,62,141],"comprehension":[51,149],"hybrid":[53,168],"retrieval":[54,75,169],"and":[55,80,102,151,178],"lightweight":[56],"structural":[57],"reasoning.":[58],"Our":[59],"work":[60],"is":[61,113,122],"in":[63,206],"systematic":[64],"evaluation":[65],"across":[66],"30":[67],"Python":[68],"repositories":[69],"with":[70,161],"180":[71],"queries,":[73],"comparing":[74],"modalities,":[76],"graph":[77,180],"expansion":[78,181],"strategies,":[79],"verification":[82],"mechanisms.":[83],"We":[84,137],"find":[85],"that":[86],"accuracy":[90,160],"arise":[91],"from":[92],"interplay":[94],"between":[95],"sparse":[96,173],"lexical":[97],"matching,":[98,174],"dense":[99,176],"semantic":[100],"similarity,":[101],"cross":[103,109,198],"file":[104,110,199],"architectural":[105,145,210],"dependencies.":[106],"Among":[107],"these,":[108],"evidence":[111,200],"discovery":[112],"largest":[115],"contributor":[116],"completeness,":[119],"it":[121],"largely":[123],"overlooked":[124],"because":[125],"existing":[126],"systems":[127,150],"rely":[128],"on":[129],"pure":[130,203],"textual":[131],"similarity":[132,205],"without":[133],"leveraging":[134],"structure.":[136],"advocate":[138],"generation":[142],"as":[143],"an":[144],"principle":[146],"demonstrate":[152],"this":[153],"need":[154],"by":[155,190,202],"92":[157],"percent":[158,208],"zero":[162],"hallucinations.":[163],"Specifically,":[164],"we":[165],"develop":[166],"system":[170],"combining":[171],"BM25":[172],"BGE":[175],"embeddings,":[177],"Neo4j":[179],"via":[182],"import":[183],"relationships,":[184],"which":[185],"outperforms":[186],"single":[187],"mode":[188],"baselines":[189],"14":[191],"18":[193],"percentage":[194],"points":[195],"while":[196],"discovering":[197],"missed":[201],"text":[204],"62":[207],"queries.":[211]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-12-17T00:00:00"}
