{"id":"https://openalex.org/W7160534462","doi":"https://doi.org/10.48550/arxiv.2605.04304","title":"Hierarchical Visual Agent: Managing Contexts in Joint Image-Text Space for Advanced Chart Reasoning","display_name":"Hierarchical Visual Agent: Managing Contexts in Joint Image-Text Space for Advanced Chart Reasoning","publication_year":2026,"publication_date":"2026-05-05","ids":{"openalex":"https://openalex.org/W7160534462","doi":"https://doi.org/10.48550/arxiv.2605.04304"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.04304","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04304","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.04304","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135617108","display_name":"Qihua Dong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Qihua","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050905744","display_name":"Ruozhen He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Ruozhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135629070","display_name":"Junwen Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Junwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135561578","display_name":"Yizhou Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yizhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135564740","display_name":"Xu Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Xu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132840923","display_name":"Songyao Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Songyao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135550310","display_name":"Yun Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Yun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9193000197410583,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9193000197410583,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.045499999076128006,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.012000000104308128,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.753000020980835},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6516000032424927},{"id":"https://openalex.org/keywords/chart","display_name":"Chart","score":0.5735999941825867},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.5580999851226807},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.4496999979019165},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.42719998955726624},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3968999981880188},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.36579999327659607},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.34779998660087585}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7903000116348267},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.753000020980835},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6516000032424927},{"id":"https://openalex.org/C190812933","wikidata":"https://www.wikidata.org/wiki/Q28923","display_name":"Chart","level":2,"score":0.5735999941825867},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.5580999851226807},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5462999939918518},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.4496999979019165},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.42719998955726624},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4124000072479248},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3968999981880188},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.34779998660087585},{"id":"https://openalex.org/C207363949","wikidata":"https://www.wikidata.org/wiki/Q462915","display_name":"Visual space","level":3,"score":0.33489999175071716},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.3151000142097473},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.28679999709129333},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C124527596","wikidata":"https://www.wikidata.org/wiki/Q17029359","display_name":"Hierarchical control system","level":3,"score":0.27160000801086426},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26159998774528503},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2542000114917755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.04304","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04304","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.04304","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04304","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Advanced":[0],"chart":[1,45],"question":[2],"answering":[3],"requires":[4],"both":[5],"precise":[6],"perception":[7],"of":[8],"small":[9],"visual":[10,41,90,101,124],"elements":[11],"and":[12,50,65,81,91,116,126],"multi-step":[13,31],"reasoning":[14,32,46,107],"across":[15,33],"several":[16],"subplots.":[17,35],"While":[18],"existing":[19],"MLLMs":[20],"are":[21],"strong":[22,113],"at":[23],"understanding":[24],"single":[25],"plots,":[26],"they":[27],"often":[28],"struggle":[29],"with":[30],"multiple":[34],"We":[36],"propose":[37],"HierVA,":[38],"a":[39,52,56,67,95],"hierarchical":[40,121],"agent":[42,87],"framework":[43],"for":[44],"that":[47,120],"iteratively":[48],"constructs":[49],"updates":[51],"working":[53],"context":[54,69,128],"in":[55],"joint":[57],"image--text":[58],"space.":[59],"A":[60],"high-level":[61],"manager":[62],"generates":[63],"plans":[64],"maintains":[66,88],"compact":[68],"containing":[70],"only":[71],"key":[72],"information,":[73],"while":[74],"specialized":[75],"workers":[76],"perform":[77],"reasoning,":[78],"gather":[79],"evidence,":[80],"return":[82],"results.":[83],"In":[84],"particular,":[85],"the":[86,100,105],"separate":[89],"textual":[92],"contexts,":[93],"using":[94],"zoom-in":[96],"tool":[97],"to":[98],"restrict":[99],"context.":[102],"Experiments":[103],"on":[104],"CharXiv":[106],"subset":[108],"demonstrate":[109],"consistent":[110],"improvements":[111],"over":[112],"multimodal":[114],"baselines,":[115],"ablation":[117],"studies":[118],"verify":[119],"architecture,":[122],"scoped":[123],"context,":[125],"distilled":[127],"contribute":[129],"complementary":[130],"gains.":[131]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-08T00:00:00"}
