{"id":"https://openalex.org/W7154397759","doi":"https://doi.org/10.48550/arxiv.2604.10167","title":"Visual Late Chunking: An Empirical Study of Contextual Chunking for Efficient Visual Document Retrieval","display_name":"Visual Late Chunking: An Empirical Study of Contextual Chunking for Efficient Visual Document Retrieval","publication_year":2026,"publication_date":"2026-04-11","ids":{"openalex":"https://openalex.org/W7154397759","doi":"https://doi.org/10.48550/arxiv.2604.10167"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.10167","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10167","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.10167","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133612271","display_name":"Yibo Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yan, Yibo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007900661","display_name":"Mingdong Ou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ou, Mingdong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133564906","display_name":"Yi Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133607178","display_name":"Jiahao Huo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huo, Jiahao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133581783","display_name":"Xin Zou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133570921","display_name":"Shuliang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Shuliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133601424","display_name":"James Kwok","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kwok, James","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133564926","display_name":"Xuming Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xuming","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5133612271"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.37209999561309814,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.37209999561309814,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.2513999938964844,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.055399999022483826,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5333999991416931},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.49380001425743103},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.4641000032424927},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.44609999656677246},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.43299999833106995},{"id":"https://openalex.org/keywords/empirical-research","display_name":"Empirical research","score":0.43149998784065247},{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.40639999508857727},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.38199999928474426}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8288000226020813},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5931000113487244},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5333999991416931},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.49380001425743103},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.4641000032424927},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.45500001311302185},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44609999656677246},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.43299999833106995},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.43149998784065247},{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.40639999508857727},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.38199999928474426},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3718999922275543},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.31839999556541443},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2833000123500824},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26579999923706055},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.263700008392334},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C59732488","wikidata":"https://www.wikidata.org/wiki/Q2528440","display_name":"Visual analytics","level":3,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.10167","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10167","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.10167","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10167","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-vector":[0],"models":[1],"dominate":[2],"Visual":[3],"Document":[4],"Retrieval":[5],"(VDR)":[6],"due":[7],"to":[8,24,41,65],"their":[9,14],"fine-grained":[10],"matching":[11],"capabilities,":[12],"but":[13],"high":[15],"storage":[16,100],"and":[17,124],"computational":[18],"costs":[19],"present":[20],"a":[21,33,61,74,96,105,117],"major":[22],"barrier":[23],"practical":[25,118],"deployment.":[26],"In":[27],"this":[28],"paper,":[29],"we":[30],"propose":[31],"ColChunk,":[32],"plug-and-play":[34],"framework":[35],"that":[36,77],"introduces":[37],"multimodal":[38],"late":[39],"chunking":[40],"construct":[42],"efficient,":[43],"contextualized":[44],"multi-vectors.":[45],"Unlike":[46],"existing":[47],"pruning":[48],"or":[49],"fixed-token":[50],"approaches,":[51],"ColChunk":[52,93,115],"employs":[53],"hierarchical":[54],"clustering":[55],"on":[56],"patch-level":[57],"embeddings,":[58],"fused":[59],"with":[60],"2D":[62],"position":[63],"prior":[64],"ensure":[66],"spatial-semantic":[67],"coherence.":[68],"This":[69],"adaptive":[70],"grouping":[71],"allows":[72],"for":[73,120],"content-aware":[75],"representation":[76],"preserves":[78],"global":[79],"context":[80],"while":[81,102],"drastically":[82],"reducing":[83],"the":[84],"vector":[85],"count.":[86],"Evaluations":[87],"across":[88,111],"24":[89],"VDR":[90],"datasets":[91],"demonstrate":[92],"achieves":[94],"over":[95],"90%":[97],"reduction":[98],"in":[99,109,126],"requirements":[101],"simultaneously":[103],"delivering":[104],"9-point":[106],"average":[107],"improvement":[108],"nDCG@5":[110],"representative":[112],"single-vector":[113],"models.":[114],"provides":[116],"solution":[119],"balancing":[121],"retrieval":[122],"accuracy":[123],"efficiency":[125],"visual":[127],"document":[128],"systems.":[129]},"counts_by_year":[],"updated_date":"2026-04-15T06:04:33.058270","created_date":"2026-04-15T00:00:00"}
