{"id":"https://openalex.org/W7162576683","doi":"https://doi.org/10.48550/arxiv.2605.27298","title":"Self-Ensembling Vision-Language Models for Chart Data Extraction","display_name":"Self-Ensembling Vision-Language Models for Chart Data Extraction","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162576683","doi":"https://doi.org/10.48550/arxiv.2605.27298"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27298","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27298","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27298","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137118947","display_name":"Thomas Berkane","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Berkane, Thomas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101930762","display_name":"Qianyi Wang","orcid":"https://orcid.org/0000-0001-9697-6865"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qianyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130390492","display_name":"Maimuna S. Majumder","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Majumder, Maimuna S.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.4277999997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.4277999997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.050999999046325684,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.03669999912381172,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/table","display_name":"Table (database)","score":0.5781999826431274},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5666999816894531},{"id":"https://openalex.org/keywords/chart","display_name":"Chart","score":0.5572999715805054},{"id":"https://openalex.org/keywords/data-extraction","display_name":"Data extraction","score":0.4514000117778778},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.44530001282691956},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.40610000491142273},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.38690000772476196}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8034999966621399},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6031000018119812},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.5781999826431274},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5666999816894531},{"id":"https://openalex.org/C190812933","wikidata":"https://www.wikidata.org/wiki/Q28923","display_name":"Chart","level":2,"score":0.5572999715805054},{"id":"https://openalex.org/C2777466982","wikidata":"https://www.wikidata.org/wiki/Q5227287","display_name":"Data extraction","level":3,"score":0.4514000117778778},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44530001282691956},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.38690000772476196},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3671000003814697},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.3452000021934509},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.26919999718666077},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C165838908","wikidata":"https://www.wikidata.org/wiki/Q736777","display_name":"Calibration","level":2,"score":0.2583000063896179},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27298","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27298","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27298","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27298","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Charts":[0],"effectively":[1],"convey":[2],"quantitative":[3],"information,":[4],"but":[5],"the":[6,63,75,109,171],"underlying":[7],"data":[8,151,205],"are":[9],"often":[10],"locked":[11],"in":[12,170,208],"image":[13,70],"form,":[14],"hindering":[15],"reuse":[16],"and":[17,24,71,85,113,155,177,214],"analysis.":[18],"Manually":[19],"digitizing":[20],"charts":[21,42,162],"is":[22],"time-consuming":[23],"error-prone,":[25],"motivating":[26],"automatic":[27],"chart-to-table":[28],"extraction.":[29],"Recent":[30],"approaches":[31],"use":[32],"specialized":[33],"vision-language":[34],"models":[35],"(VLMs),":[36],"yet":[37],"performance":[38],"still":[39],"lags":[40],"on":[41,117,159,194],"with":[43,136,152],"many":[44],"datapoints":[45,167],"or":[46],"substantial":[47],"stylistic":[48],"variation.":[49],"We":[50,81],"propose":[51],"a":[52,67,94,144],"VLM":[53,65,186],"self-ensembling":[54],"method":[55,100,201],"that":[56],"repeatedly":[57],"samples":[58,120],"multiple":[59],"tabular":[60,204],"outputs":[61],"from":[62,148],"same":[64],"for":[66,139],"fixed":[68],"chart":[69,129,209],"aggregates":[72],"them":[73],"at":[74],"level":[76],"of":[77],"individual":[78],"table":[79,111],"cells.":[80],"align":[82],"candidate":[83],"tables":[84],"take":[86],"per-cell":[87],"medians":[88],"over":[89,184],"numerical":[90],"values":[91],"to":[92,105,121,190],"produce":[93],"more":[95,153,166],"accurate":[96],"consensus":[97],"table.":[98],"Our":[99],"also":[101],"includes":[102],"convergence":[103],"detection":[104],"stop":[106],"sampling":[107],"once":[108],"aggregated":[110],"stabilizes,":[112],"uncertainty":[114],"estimation":[115],"based":[116],"dispersion":[118],"across":[119],"help":[122],"users":[123],"assess":[124],"extraction":[125,130,182],"reliability.":[126],"Because":[127],"existing":[128],"benchmarks":[131],"contain":[132,163],"relatively":[133],"simple":[134],"plots":[135],"limited":[137],"room":[138],"improvement,":[140],"we":[141],"introduce":[142],"WB-ChartExtract,":[143,178],"new":[145],"benchmark":[146],"built":[147],"World":[149],"Bank":[150],"complex":[154],"stylistically":[156],"diverse":[157],"charts;":[158],"average,":[160],"its":[161],"7":[164],"times":[165],"than":[168],"those":[169],"ChartQA":[172,176],"benchmark.":[173],"Across":[174],"both":[175],"our":[179,200],"approach":[180],"improves":[181],"accuracy":[183],"single-pass":[185],"outputs,":[187],"yielding":[188],"up":[189],"23%":[191],"relative":[192],"improvement":[193],"WB-ChartExtract":[195],"after":[196],"ensembling.":[197],"More":[198],"broadly,":[199],"helps":[202],"unlock":[203],"previously":[206],"siloed":[207],"images,":[210],"enabling":[211],"downstream":[212],"analysis":[213],"reuse.":[215]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
