{"id":"https://openalex.org/W7134854352","doi":"https://doi.org/10.48550/arxiv.2603.07612","title":"KohakuRAG: A simple RAG framework with hierarchical document indexing","display_name":"KohakuRAG: A simple RAG framework with hierarchical document indexing","publication_year":2026,"publication_date":"2026-03-08","ids":{"openalex":"https://openalex.org/W7134854352","doi":"https://doi.org/10.48550/arxiv.2603.07612"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.07612","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128652028","display_name":"Shih-Ying Yeh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yeh, Shih-Ying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113354552","display_name":"Yueh-Feng Ku","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ku, Yueh-Feng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040329608","display_name":"Ko-Wei Huang","orcid":"https://orcid.org/0000-0003-1489-540X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ko-Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128631220","display_name":"Buu-Khang Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Buu-Khang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7200000286102295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7200000286102295,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.17080000042915344,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.026200000196695328,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5317999720573425},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.42410001158714294},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.42239999771118164},{"id":"https://openalex.org/keywords/document-retrieval","display_name":"Document retrieval","score":0.40790000557899475},{"id":"https://openalex.org/keywords/tree","display_name":"Tree (set theory)","score":0.4036000072956085},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.3698999881744385},{"id":"https://openalex.org/keywords/voting","display_name":"Voting","score":0.3474999964237213},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.3199000060558319}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7601000070571899},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.567799985408783},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5317999720573425},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.42410001158714294},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.42239999771118164},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.40790000557899475},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.4036000072956085},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3698999881744385},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36160001158714294},{"id":"https://openalex.org/C520049643","wikidata":"https://www.wikidata.org/wiki/Q189760","display_name":"Voting","level":3,"score":0.3474999964237213},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3257000148296356},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3142000138759613},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3052999973297119},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.29750001430511475},{"id":"https://openalex.org/C159254197","wikidata":"https://www.wikidata.org/wiki/Q1144915","display_name":"Lexicographical order","level":2,"score":0.29499998688697815},{"id":"https://openalex.org/C2779151265","wikidata":"https://www.wikidata.org/wiki/Q1156791","display_name":"Copying","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C163797641","wikidata":"https://www.wikidata.org/wiki/Q2067937","display_name":"Tree structure","level":3,"score":0.2696000039577484},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2630000114440918},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2621999979019165},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.2574999928474426}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.07612","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.07612","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.07612","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.07612","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Retrieval-augmented":[0],"generation":[1],"(RAG)":[2],"systems":[3,103],"that":[4,38,53,148],"answer":[5,105],"questions":[6,107],"from":[7,108],"document":[8,22,55],"collections":[9],"face":[10],"compounding":[11],"difficulties":[12],"when":[13],"high-precision":[14],"citations":[15],"are":[16],"required:":[17],"flat":[18],"chunking":[19],"strategies":[20],"sacrifice":[21],"structure,":[23],"single-query":[24],"formulations":[25],"miss":[26],"relevant":[27],"passages":[28],"through":[29,57,76,87],"vocabulary":[30],"mismatch,":[31],"and":[32,43,84,115,126,156],"single-pass":[33],"inference":[34,89],"produces":[35],"stochastic":[36],"answers":[37,86],"vary":[39],"in":[40],"both":[41,124,142],"content":[42],"citation":[44],"selection.":[45],"We":[46,93,179],"present":[47],"KohakuRAG,":[48],"a":[49,58,100],"hierarchical":[50,167],"RAG":[51],"framework":[52],"preserves":[54],"structure":[56],"four-level":[59],"tree":[60],"representation":[61],"(document":[62],"$\\rightarrow$":[63,65,67],"section":[64],"paragraph":[66],"sentence)":[68],"with":[69,81,90,111,159],"bottom-up":[70],"embedding":[71],"aggregation,":[72],"improves":[73],"retrieval":[74,169],"coverage":[75],"an":[77],"LLM-powered":[78],"query":[79],"planner":[80],"cross-query":[82],"reranking,":[83],"stabilizes":[85],"ensemble":[88,157],"abstention-aware":[91],"voting.":[92],"evaluate":[94],"on":[95,123],"the":[96,133,138],"WattBot":[97],"2025":[98],"Challenge,":[99],"benchmark":[101],"requiring":[102],"to":[104,136],"technical":[106],"32":[109],"documents":[110],"$\\pm$0.1%":[112],"numeric":[113],"tolerance":[114],"exact":[116],"source":[117],"attribution.":[118],"KohakuRAG":[119,181],"achieves":[120],"first":[121],"place":[122],"public":[125],"private":[127],"leaderboards":[128],"(final":[129],"score":[130],"0.861),":[131],"as":[132,182],"only":[134,177],"team":[135],"maintain":[137],"top":[139],"position":[140],"across":[141],"evaluation":[143],"partitions.":[144],"Ablation":[145],"studies":[146],"reveal":[147],"prompt":[149],"ordering":[150],"(+80%":[151],"relative),":[152],"retry":[153],"mechanisms":[154],"(+69%),":[155],"voting":[158],"blank":[160],"filtering":[161],"(+1.2pp)":[162],"each":[163],"contribute":[164],"substantially,":[165],"while":[166],"dense":[168],"alone":[170],"matches":[171],"hybrid":[172],"sparse-dense":[173],"approaches":[174],"(BM25":[175],"adds":[176],"+3.1pp).":[178],"release":[180],"open-source":[183],"software":[184],"at":[185],"https://github.com/KohakuBlueleaf/KohakuRAG.":[186]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-11T00:00:00"}
