{"id":"https://openalex.org/W7130698851","doi":"https://doi.org/10.48550/arxiv.2602.16974","title":"Beyond Chunk-Then-Embed: A Comprehensive Taxonomy and Evaluation of Document Chunking Strategies for Information Retrieval","display_name":"Beyond Chunk-Then-Embed: A Comprehensive Taxonomy and Evaluation of Document Chunking Strategies for Information Retrieval","publication_year":2026,"publication_date":"2026-02-19","ids":{"openalex":"https://openalex.org/W7130698851","doi":"https://doi.org/10.48550/arxiv.2602.16974"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.16974","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Zhou, Yongjie","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhou, Yongjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126498237","display_name":"Shuai Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126522991","display_name":"Bevan Koopman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koopman, Bevan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124948006","display_name":"Guido Zuccon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuccon, Guido","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.8727999925613403,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.8727999925613403,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.030899999663233757,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.024000000208616257,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.8414999842643738},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.5282999873161316},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.48159998655319214},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4805000126361847},{"id":"https://openalex.org/keywords/precision-and-recall","display_name":"Precision and recall","score":0.41449999809265137},{"id":"https://openalex.org/keywords/document-retrieval","display_name":"Document retrieval","score":0.32409998774528503}],"concepts":[{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.8414999842643738},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7732999920845032},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.5282999873161316},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.48159998655319214},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4805000126361847},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46000000834465027},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.45829999446868896},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.41449999809265137},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3465999960899353},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3077000081539154},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.265500009059906}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.16974","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.16974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.16974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.16974","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Document":[0],"chunking":[1,16,73,113,118,153,174],"is":[2],"a":[3,76],"critical":[4],"preprocessing":[5],"step":[6],"in":[7,71,127,133],"dense":[8],"retrieval":[9,130,137,141,145],"systems,":[10],"yet":[11],"the":[12,110],"design":[13],"space":[14],"of":[15,112],"strategies":[17,82,154],"remains":[18],"poorly":[19],"understood.":[20],"Recent":[21],"research":[22],"has":[23],"proposed":[24],"several":[25],"concurrent":[26],"approaches,":[27],"including":[28,90],"LLM-guided":[29,102,161],"methods":[30,51,92,159],"(e.g.,":[31],"DenseX":[32],"and":[33,35,54,74,95,101,104,139,210],"LumberChunker)":[34],"contextualized":[36,120],"strategies(e.g.,":[37],"Late":[38],"Chunking),":[39],"which":[40,108],"generate":[41],"embeddings":[42],"before":[43],"segmentation":[44,88,198],"to":[45,115],"preserve":[46],"contextual":[47],"information.":[48],"However,":[49],"these":[50,125],"emerged":[52],"independently":[53],"were":[55],"evaluated":[56],"on":[57],"benchmarks":[58,212],"with":[59,190,194],"minimal":[60],"overlap,":[61],"making":[62],"direct":[63],"comparisons":[64],"difficult.":[65],"This":[66],"paper":[67],"reproduces":[68],"prior":[69],"studies":[70],"document":[72],"presents":[75],"systematic":[77],"framework":[78],"that":[79,151,185],"unifies":[80],"existing":[81],"along":[83],"two":[84,128],"key":[85],"dimensions:":[86],"(1)":[87],"methods,":[89],"structure-based":[91,158],"(fixed-size,":[93],"sentence-based,":[94],"paragraph-based)":[96],"as":[97,99],"well":[98],"semantically-informed":[100],"methods;":[103],"(2)":[105],"embedding":[106,116],"paradigms,":[107],"determine":[109],"timing":[111],"relative":[114],"(pre-embedding":[117],"vs.":[119],"chunking).":[121],"Our":[122,147,208],"reproduction":[123],"evaluates":[124],"approaches":[126],"distinct":[129],"settings":[131],"established":[132],"previous":[134],"work:":[135],"in-document":[136,171,180,191],"(needle-in-a-haystack)":[138],"in-corpus":[140,164,176,195],"(the":[142],"standard":[143],"information":[144],"task).":[146],"comprehensive":[148],"evaluation":[149,211],"reveals":[150],"optimal":[152],"are":[155,201,213],"task-dependent:":[156],"simple":[157],"outperform":[160],"alternatives":[162],"for":[163,170],"retrieval,":[165],"while":[166],"LumberChunker":[167],"performs":[168],"best":[169],"retrieval.":[172,181],"Contextualized":[173],"improves":[175],"effectiveness":[177],"but":[178,192],"degrades":[179],"We":[182],"also":[183],"find":[184],"chunk":[186,206],"size":[187],"correlates":[188],"moderately":[189],"weakly":[193],"effectiveness,":[196],"suggesting":[197],"method":[199],"differences":[200],"not":[202],"purely":[203],"driven":[204],"by":[205],"size.":[207],"code":[209],"publicly":[214],"available":[215],"at":[216],"(Anonymoused).":[217]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-21T00:00:00"}
