{"id":"https://openalex.org/W7130529623","doi":"https://doi.org/10.48550/arxiv.2602.16455","title":"Visual Self-Refine: A Pixel-Guided Paradigm for Accurate Chart Parsing","display_name":"Visual Self-Refine: A Pixel-Guided Paradigm for Accurate Chart Parsing","publication_year":2026,"publication_date":"2026-02-18","ids":{"openalex":"https://openalex.org/W7130529623","doi":"https://doi.org/10.48550/arxiv.2602.16455"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.16455","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126379463","display_name":"Jinsong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Jinsong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055238399","display_name":"Xiaoyi Dong","orcid":"https://orcid.org/0000-0002-4654-835X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dong, Xiaoyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122852095","display_name":"Yuhang Zang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zang, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126429612","display_name":"Yuhang Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126408850","display_name":"Jiaqi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jiaqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126370910","display_name":"Dahua Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Dahua","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5126379463"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8199999928474426,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8199999928474426,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.05689999833703041,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.02019999921321869,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.7599999904632568},{"id":"https://openalex.org/keywords/chart","display_name":"Chart","score":0.6144000291824341},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.583899974822998},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.48489999771118164},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.46160000562667847},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4408000111579895}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8434000015258789},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.7599999904632568},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6686000227928162},{"id":"https://openalex.org/C190812933","wikidata":"https://www.wikidata.org/wiki/Q28923","display_name":"Chart","level":2,"score":0.6144000291824341},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.583899974822998},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.48489999771118164},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.46160000562667847},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.45750001072883606},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4408000111579895},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4366999864578247},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4336000084877014},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4036000072956085},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.31859999895095825},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3057999908924103},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.27250000834465027},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.25609999895095825}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.16455","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.16455","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.16455","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.16455","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8872405886650085,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"Large":[1],"Vision-Language":[2],"Models":[3],"(LVLMs)":[4],"have":[5],"demonstrated":[6],"remarkable":[7],"capabilities":[8],"for":[9,22,196,215],"reasoning":[10],"and":[11,48,96,109,159,192],"self-correction":[12],"at":[13],"the":[14,52,119,123,134,151,175,181],"textual":[15],"level,":[16],"these":[17,99,166],"strengths":[18],"provide":[19],"minimal":[20],"benefits":[21],"complex":[23,68],"tasks":[24],"centered":[25],"on":[26,218],"visual":[27,114,147,171,207],"perception,":[28],"such":[29],"as":[30,59,169,204],"Chart":[31,126],"Parsing.":[32],"Existing":[33],"models":[34],"often":[35],"struggle":[36],"with":[37],"visually":[38],"dense":[39],"charts,":[40,69],"leading":[41],"to":[42,63,85,89,102,106,149,173],"errors":[43],"like":[44],"data":[45,155],"omission,":[46],"misalignment,":[47],"hallucination.":[49],"Inspired":[50],"by":[51,128],"human":[53],"strategy":[54],"of":[55,82,125,153,183,222],"using":[56],"a":[57,60,72,87,140,160,190,205,211,219],"finger":[58],"``visual":[61],"anchor''":[62],"ensure":[64,150],"accuracy":[65,152,217],"when":[66],"reading":[67],"we":[70,186],"propose":[71],"new":[73,191,213],"paradigm":[74,121],"named":[75],"Visual":[76],"Self-Refine":[77],"(VSR).":[78],"The":[79],"core":[80],"idea":[81],"VSR":[83,120,203],"is":[84],"enable":[86],"model":[88,132],"generate":[90],"pixel-level":[91],"localization":[92],"outputs,":[93],"visualize":[94],"them,":[95],"then":[97],"feed":[98],"visualizations":[100],"back":[101],"itself,":[103],"allowing":[104],"it":[105,144,164],"intuitively":[107],"inspect":[108],"correct":[110],"its":[111],"own":[112],"potential":[113],"perception":[115],"errors.":[116],"We":[117],"instantiate":[118],"in":[122],"domain":[124],"Parsing":[127],"proposing":[129],"ChartVSR.":[130],"This":[131],"decomposes":[133],"parsing":[135],"process":[136],"into":[137],"two":[138],"stages:":[139],"Refine":[141],"Stage,":[142,162],"where":[143,163],"iteratively":[145],"uses":[146,165],"feedback":[148,208],"all":[154],"points'":[156],"Pixel-level":[157],"Localizations,":[158],"Decode":[161],"verified":[167],"localizations":[168],"precise":[170],"anchors":[172],"parse":[174],"final":[176],"structured":[177],"data.":[178],"To":[179],"address":[180],"limitations":[182],"existing":[184],"benchmarks,":[185],"also":[187,201],"construct":[188],"ChartP-Bench,":[189],"highly":[193],"challenging":[194],"benchmark":[195],"chart":[197],"parsing.":[198],"Our":[199],"work":[200],"highlights":[202],"general-purpose":[206],"mechanism,":[209],"offering":[210],"promising":[212],"direction":[214],"enhancing":[216],"wide":[220],"range":[221],"vision-centric":[223],"tasks.":[224]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-20T00:00:00"}
