{"id":"https://openalex.org/W7129462205","doi":"https://doi.org/10.48550/arxiv.2602.14162","title":"Index Light, Reason Deep: Deferred Visual Ingestion for Visual-Dense Document Question Answering","display_name":"Index Light, Reason Deep: Deferred Visual Ingestion for Visual-Dense Document Question Answering","publication_year":2026,"publication_date":"2026-02-15","ids":{"openalex":"https://openalex.org/W7129462205","doi":"https://doi.org/10.48550/arxiv.2602.14162"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.14162","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126182284","display_name":"Tao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xu, Tao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5126182284"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5702999830245972,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5702999830245972,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.14480000734329224,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.14110000431537628,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8242999911308289},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.7226999998092651},{"id":"https://openalex.org/keywords/document-retrieval","display_name":"Document retrieval","score":0.5623999834060669},{"id":"https://openalex.org/keywords/index","display_name":"Index (typography)","score":0.5249999761581421},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4837999939918518},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4429999887943268},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.42980000376701355},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.42500001192092896},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.3476000130176544},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.34459999203681946}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8242999911308289},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.7226999998092651},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6958000063896179},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6827999949455261},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.5623999834060669},{"id":"https://openalex.org/C2777382242","wikidata":"https://www.wikidata.org/wiki/Q6017816","display_name":"Index (typography)","level":2,"score":0.5249999761581421},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4959000051021576},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4837999939918518},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44339999556541443},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4429999887943268},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.42980000376701355},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.42500001192092896},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.3476000130176544},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.34459999203681946},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.3393000066280365},{"id":"https://openalex.org/C89686163","wikidata":"https://www.wikidata.org/wiki/Q1187982","display_name":"Vector space model","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.2924000024795532},{"id":"https://openalex.org/C191072391","wikidata":"https://www.wikidata.org/wiki/Q17043235","display_name":"Retrievability","level":3,"score":0.2870999872684479},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2842000126838684},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C2778496695","wikidata":"https://www.wikidata.org/wiki/Q254128","display_name":"Dilemma","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C44083865","wikidata":"https://www.wikidata.org/wiki/Q3853443","display_name":"Mean reciprocal rank","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C130590232","wikidata":"https://www.wikidata.org/wiki/Q1671754","display_name":"Inverted index","level":3,"score":0.2549000084400177},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.2526000142097473},{"id":"https://openalex.org/C2777601897","wikidata":"https://www.wikidata.org/wiki/Q3409113","display_name":"Presentation (obstetrics)","level":2,"score":0.25110000371932983},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.2500999867916107}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.14162","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.14162","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.14162","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.14162","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.6587537527084351,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"multimodal":[1],"document":[2,88],"question":[3,127],"answering":[4],"methods":[5],"predominantly":[6],"adopt":[7],"a":[8,16,48,99,131,172,229],"Pre-Ingestion":[9],"(PI)":[10],"strategy:":[11],"during":[12,84,110],"the":[13,75,103,120,125,142,184,202,240,244,249],"indexing":[14,227],"phase,":[15],"Vision":[17],"Language":[18],"Model":[19],"(VLM)":[20],"is":[21,211],"called":[22],"on":[23,51,68,138,146,161,170,207,243],"every":[24],"page":[25,28],"to":[26,96,130,216],"generate":[27],"descriptions":[29,57],"that":[30,201,223,239],"are":[31,38,114,128],"then":[32],"encoded":[33],"into":[34],"vectors,":[35],"and":[36,63,119,233],"questions":[37],"answered":[39],"via":[40,116],"embedding":[41,64,205],"similarity":[42],"retrieval.":[43],"However,":[44],"this":[45],"approach":[46],"faces":[47],"dual":[49],"dilemma":[50],"visual-dense":[52],"engineering":[53,148,209],"documents:":[54],"VLM":[55,82,132,234],"blind":[56],"inevitably":[58],"lose":[59],"critical":[60],"visual":[61,192],"details,":[62],"retrieval":[65,177,193,206,231,245],"systematically":[66],"fails":[67],"highly":[69],"similar":[70],"documents.":[71],"This":[72],"paper":[73],"proposes":[74],"Deferred":[76],"Visual":[77],"Ingestion":[78],"(DVI)":[79],"framework:":[80],"zero":[81],"calls":[83],"preprocessing,":[85],"leveraging":[86],"only":[87,197],"structural":[89,212],"information":[90],"(table":[91],"of":[92,144,204],"contents,":[93],"drawing":[94],"numbers)":[95],"automatically":[97],"build":[98],"hierarchical":[100],"index":[101],"through":[102],"HDNC":[104,224],"(Hierarchical":[105],"Drawing":[106],"Number":[107],"Clustering)":[108],"algorithm;":[109],"inference,":[111],"candidate":[112],"pages":[113],"located":[115],"BM25":[117],"retrieval,":[118],"original":[121],"images":[122],"along":[123],"with":[124],"specific":[126],"sent":[129],"for":[133],"targeted":[134],"analysis.":[135],"Large-scale":[136],"experiments":[137],"three":[139],"datasets":[140],"validate":[141],"effectiveness":[143],"DVI:":[145],"Bridge":[147,185],"drawings":[149],"(1,323":[150],"questions),":[151,165,176],"end-to-end":[152],"QA":[153],"accuracy":[154],"reaches":[155],"65.6\\%":[156],"vs.":[157,167,181],"PI's":[158],"24.3\\%":[159],"(+41.3pp);":[160],"Steel":[162],"catalog":[163],"(186":[164],"30.6\\%":[166],"16.1\\%":[168],"(+14.5pp);":[169],"CircuitVQA,":[171],"public":[173],"benchmark":[174],"(9,315":[175],"ImgR@3":[178],"achieves":[179],"31.2\\%":[180],"0.7\\%.":[182],"On":[183],"dataset,":[186],"we":[187],"evaluated":[188],"ColPali":[189],"(ICLR":[190],"2025":[191],"SOTA),":[194],"which":[195],"achieved":[196],"20.1\\%":[198],"PageR@3,":[199],"demonstrating":[200],"failure":[203],"homogeneous":[208],"documents":[210],"rather":[213,247],"than":[214,248],"due":[215],"insufficient":[217],"model":[218],"capability.":[219],"Ablation":[220],"studies":[221],"show":[222],"zero-cost":[225],"automatic":[226],"yields":[228],"+27.5pp":[230],"improvement,":[232],"conversion":[235],"rate":[236],"analysis":[237],"confirms":[238],"bottleneck":[241],"lies":[242],"side":[246],"comprehension":[250],"side.":[251]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-18T00:00:00"}
