{"id":"https://openalex.org/W7133320449","doi":"https://doi.org/10.48550/arxiv.2603.01666","title":"Beyond the Grid: Layout-Informed Multi-Vector Retrieval with Parsed Visual Document Representations","display_name":"Beyond the Grid: Layout-Informed Multi-Vector Retrieval with Parsed Visual Document Representations","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133320449","doi":"https://doi.org/10.48550/arxiv.2603.01666"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01666","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127993828","display_name":"Yibo Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yan, Yibo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007900661","display_name":"Mingdong Ou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ou, Mingdong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127911040","display_name":"Yi Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127951899","display_name":"Xin Zou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127920818","display_name":"Shuliang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Shuliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080566405","display_name":"Jiahao Huo","orcid":"https://orcid.org/0000-0001-6686-2576"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huo, Jiahao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127939420","display_name":"Yu Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127981767","display_name":"James Kwok","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kwok, James","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127911057","display_name":"Xuming Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xuming","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5127993828"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.4641999900341034,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.4641999900341034,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.38769999146461487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.04439999908208847,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.7156999707221985},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.567799985408783},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5371000170707703},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5138000249862671},{"id":"https://openalex.org/keywords/document-retrieval","display_name":"Document retrieval","score":0.35010001063346863},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.3481000065803528},{"id":"https://openalex.org/keywords/core","display_name":"Core (optical fiber)","score":0.3345000147819519},{"id":"https://openalex.org/keywords/data-retrieval","display_name":"Data retrieval","score":0.3273000121116638}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8026999831199646},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.7156999707221985},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.567799985408783},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5371000170707703},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5138000249862671},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4968999922275543},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.46149998903274536},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.35010001063346863},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.3481000065803528},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3345000147819519},{"id":"https://openalex.org/C551230270","wikidata":"https://www.wikidata.org/wiki/Q4368942","display_name":"Data retrieval","level":2,"score":0.3273000121116638},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.3255000114440918},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3075999915599823},{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.30239999294281006},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2987000048160553},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01666","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01666","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Harnessing":[0],"the":[1,129,133,140],"full":[2],"potential":[3],"of":[4,81,136,143],"visually-rich":[5],"documents":[6],"requires":[7],"retrieval":[8,138],"systems":[9],"that":[10,37,70,105],"understand":[11],"not":[12],"just":[13],"text,":[14],"but":[15],"intricate":[16],"layouts,":[17],"a":[18,33,67,72,78,90,96,147],"core":[19],"challenge":[20],"in":[21],"Visual":[22],"Document":[23],"Retrieval":[24],"(VDR).":[25],"The":[26],"prevailing":[27],"multi-vector":[28,100,137],"architectures,":[29],"while":[30,114],"powerful,":[31],"face":[32],"crucial":[34],"storage":[35,109],"bottleneck":[36],"current":[38],"optimization":[39],"strategies,":[40],"such":[41],"as":[42],"embedding":[43],"merging,":[44],"pruning,":[45],"or":[46,56],"using":[47],"abstract":[48],"tokens,":[49],"fail":[50],"to":[51,76,94],"resolve":[52],"without":[53],"compromising":[54],"performance":[55,118],"ignoring":[57],"vital":[58],"layout":[59],"cues.":[60],"To":[61],"address":[62],"this,":[63],"we":[64],"introduce":[65],"ColParse,":[66],"novel":[68],"paradigm":[69],"leverages":[71],"document":[73],"parsing":[74],"model":[75],"generate":[77],"small":[79],"set":[80],"layout-informed":[82],"sub-image":[83],"embeddings,":[84],"which":[85],"are":[86],"then":[87],"fused":[88],"with":[89],"global":[91],"page-level":[92],"vector":[93],"create":[95],"compact":[97],"and":[98,123,139,152],"structurally-aware":[99],"representation.":[101],"Extensive":[102],"experiments":[103],"demonstrate":[104],"our":[106],"method":[107],"reduces":[108],"requirements":[110],"by":[111],"over":[112],"95%":[113],"simultaneously":[115],"yielding":[116],"significant":[117],"gains":[119],"across":[120],"numerous":[121],"benchmarks":[122],"base":[124],"models.":[125],"ColParse":[126],"thus":[127],"bridges":[128],"critical":[130],"gap":[131],"between":[132],"fine-grained":[134],"accuracy":[135],"practical":[141],"demands":[142],"large-scale":[144],"deployment,":[145],"offering":[146],"new":[148],"path":[149],"towards":[150],"efficient":[151],"interpretable":[153],"multimodal":[154],"information":[155],"systems.":[156]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
