{"id":"https://openalex.org/W7118242908","doi":"https://doi.org/10.48550/arxiv.2601.00891","title":"Enhancing Retrieval-Augmented Generation with Topic-Enriched Embeddings: A Hybrid Approach Integrating Traditional NLP Techniques","display_name":"Enhancing Retrieval-Augmented Generation with Topic-Enriched Embeddings: A Hybrid Approach Integrating Traditional NLP Techniques","publication_year":2025,"publication_date":"2025-12-31","ids":{"openalex":"https://openalex.org/W7118242908","doi":"https://doi.org/10.48550/arxiv.2601.00891"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.00891","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.00891","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.00891","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085697451","display_name":"Rodrigo Kataishi","orcid":"https://orcid.org/0000-0002-6316-1528"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kataishi, Rodrigo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5085697451"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8503000140190125,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8503000140190125,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.03830000013113022,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.014700000174343586,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/latent-dirichlet-allocation","display_name":"Latent Dirichlet allocation","score":0.6712999939918518},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.5612999796867371},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5160999894142151},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.510699987411499},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.5019000172615051},{"id":"https://openalex.org/keywords/topic-model","display_name":"Topic model","score":0.49140000343322754},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.4740999937057495},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4440999925136566},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.43220001459121704},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.43140000104904175}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7936999797821045},{"id":"https://openalex.org/C500882744","wikidata":"https://www.wikidata.org/wiki/Q269236","display_name":"Latent Dirichlet allocation","level":3,"score":0.6712999939918518},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6590999960899353},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5795000195503235},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.5612999796867371},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5160999894142151},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.510699987411499},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.5019000172615051},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.49140000343322754},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.4740999937057495},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4440999925136566},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.43220001459121704},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.43140000104904175},{"id":"https://openalex.org/C177937566","wikidata":"https://www.wikidata.org/wiki/Q4223102","display_name":"Document clustering","level":3,"score":0.41359999775886536},{"id":"https://openalex.org/C170133592","wikidata":"https://www.wikidata.org/wiki/Q1806883","display_name":"Latent semantic analysis","level":2,"score":0.3711000084877014},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.36090001463890076},{"id":"https://openalex.org/C111030470","wikidata":"https://www.wikidata.org/wiki/Q1430460","display_name":"Curse of dimensionality","level":2,"score":0.3418000042438507},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C112933361","wikidata":"https://www.wikidata.org/wiki/Q2845258","display_name":"Probabilistic latent semantic analysis","level":2,"score":0.3296000063419342},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.3237999975681305},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.313400000333786},{"id":"https://openalex.org/C89686163","wikidata":"https://www.wikidata.org/wiki/Q1187982","display_name":"Vector space model","level":2,"score":0.3066999912261963},{"id":"https://openalex.org/C2778109090","wikidata":"https://www.wikidata.org/wiki/Q7781195","display_name":"Thematic structure","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27639999985694885},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C39920170","wikidata":"https://www.wikidata.org/wiki/Q693083","display_name":"Soundness","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C7797323","wikidata":"https://www.wikidata.org/wiki/Q3798612","display_name":"Pointwise mutual information","level":3,"score":0.2667999863624573},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.26269999146461487},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2612000107765198},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C70518039","wikidata":"https://www.wikidata.org/wiki/Q16000077","display_name":"Dimensionality reduction","level":2,"score":0.2556000053882599},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.2540000081062317},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.00891","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.00891","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.00891","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.00891","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Retrieval-augmented":[0],"generation":[1],"(RAG)":[2],"systems":[3],"rely":[4],"on":[5,109],"accurate":[6],"document":[7],"retrieval":[8,19,97,120],"to":[9,69,104],"ground":[10],"large":[11],"language":[12],"models":[13],"(LLMs)":[14],"in":[15,23,116],"external":[16],"knowledge,":[17],"yet":[18],"quality":[20],"often":[21],"degrades":[22],"corpora":[24],"where":[25],"topics":[26],"overlap":[27],"and":[28,42,56,64,74,88,99,119],"thematic":[29],"variation":[30],"is":[31],"high.":[32],"This":[33],"work":[34],"proposes":[35],"topic-enriched":[36,91,124],"embeddings":[37,92,125],"that":[38,123],"integrate":[39],"term-based":[40],"signals":[41],"topic":[43,54],"structure":[44],"with":[45,53,78],"contextual":[46,81,106],"sentence":[47],"embeddings.":[48],"The":[49],"approach":[50],"combines":[51],"TF-IDF":[52],"modeling":[55],"dimensionality":[57],"reduction,":[58],"using":[59],"Latent":[60,65],"Semantic":[61],"Analysis":[62],"(LSA)":[63],"Dirichlet":[66],"Allocation":[67],"(LDA)":[68],"encode":[70],"latent":[71],"topical":[72],"organization,":[73],"fuses":[75],"these":[76],"representations":[77],"a":[79,110,129],"compact":[80],"encoder":[82],"(all-MiniLM).":[83],"By":[84],"jointly":[85],"capturing":[86],"term-level":[87],"topic-level":[89],"semantics,":[90],"improve":[93],"semantic":[94],"clustering,":[95],"increase":[96],"precision,":[98],"reduce":[100],"computational":[101],"burden":[102],"relative":[103],"purely":[105],"baselines.":[107],"Experiments":[108],"legal-text":[111],"corpus":[112],"show":[113],"consistent":[114],"gains":[115],"clustering":[117],"coherence":[118],"metrics,":[121],"suggesting":[122],"can":[126],"serve":[127],"as":[128],"practical":[130],"component":[131],"for":[132],"more":[133],"reliable":[134],"knowledge-intensive":[135],"RAG":[136],"pipelines.":[137]},"counts_by_year":[],"updated_date":"2026-01-08T20:10:11.968330","created_date":"2026-01-08T00:00:00"}
