{"id":"https://openalex.org/W7155061653","doi":"https://doi.org/10.48550/arxiv.2604.17488","title":"AutoVQA-G: Self-Improving Agentic Framework for Automated Visual Question Answering and Grounding Annotation","display_name":"AutoVQA-G: Self-Improving Agentic Framework for Automated Visual Question Answering and Grounding Annotation","publication_year":2026,"publication_date":"2026-04-19","ids":{"openalex":"https://openalex.org/W7155061653","doi":"https://doi.org/10.48550/arxiv.2604.17488"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.17488","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.17488","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.17488","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066365804","display_name":"Rongsheng Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hu, Rongsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134140213","display_name":"Runwei Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Runwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041344406","display_name":"Yicheng Di","orcid":"https://orcid.org/0000-0003-3802-2080"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Di, Yicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101539801","display_name":"Jiayu Bao","orcid":"https://orcid.org/0009-0004-6690-427X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bao, Jiayu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134116489","display_name":"Yuan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5066365804"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.002199999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.6547999978065491},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5936999917030334},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5874999761581421},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5073000192642212},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5059000253677368},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.45320001244544983},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.44620001316070557}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7609000205993652},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.6547999978065491},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5936999917030334},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5906000137329102},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5874999761581421},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5073000192642212},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5059000253677368},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.45320001244544983},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.44679999351501465},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.44620001316070557},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4115000069141388},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3628999888896942},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.31209999322891235},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3034000098705292},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.28929999470710754},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2646999955177307},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26330000162124634},{"id":"https://openalex.org/C172367668","wikidata":"https://www.wikidata.org/wiki/Q6504956","display_name":"Data visualization","level":3,"score":0.2621000111103058},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.17488","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.17488","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.17488","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.17488","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Manual":[0],"annotation":[1],"of":[2],"high-quality":[3],"visual":[4,13,86,117],"question":[5],"answering":[6],"with":[7,15,115],"grounding":[8,118],"(VQA-G)":[9],"datasets,":[10],"which":[11],"pair":[12],"questions":[14],"evidential":[16],"grounding,":[17],"is":[18],"crucial":[19],"for":[20,65,84,129],"advancing":[21],"vision-language":[22],"models":[23],"(VLMs),":[24],"but":[25],"remains":[26],"unscalable.":[27],"Existing":[28],"automated":[29,66],"methods":[30],"are":[31],"often":[32],"hindered":[33],"by":[34],"two":[35],"key":[36],"issues:":[37],"(1)":[38],"inconsistent":[39],"data":[40,132],"fidelity":[41],"due":[42],"to":[43,102,121,133],"model":[44],"hallucinations;":[45],"(2)":[46],"brittle":[47],"verification":[48],"mechanisms":[49],"based":[50],"on":[51,89],"simple":[52],"heuristics.":[53],"To":[54],"address":[55],"these":[56],"limitations,":[57],"we":[58],"introduce":[59],"AutoVQA-G,":[60],"a":[61,76,92,126],"self-improving":[62],"agentic":[63],"framework":[64],"VQA-G":[67,113],"annotation.":[68],"AutoVQA-G":[69,111],"employs":[70],"an":[71],"iterative":[72],"refinement":[73],"loop":[74],"where":[75],"Consistency":[77],"Evaluation":[78],"module":[79],"uses":[80],"Chain-of-Thought":[81],"(CoT)":[82],"reasoning":[83],"fine-grained":[85],"verification.":[87],"Based":[88],"this":[90],"feedback,":[91],"memory-augmented":[93],"Prompt":[94],"Optimization":[95],"agent":[96],"analyzes":[97],"critiques":[98],"from":[99],"failed":[100],"samples":[101],"progressively":[103],"refine":[104],"generation":[105],"prompts.":[106],"Our":[107],"experiments":[108],"show":[109],"that":[110],"generates":[112],"datasets":[114],"superior":[116],"accuracy":[119],"compared":[120],"leading":[122],"multimodal":[123],"LLMs,":[124],"offering":[125],"promising":[127],"approach":[128],"creating":[130],"high-fidelity":[131],"facilitate":[134],"more":[135],"robust":[136],"VLM":[137],"training":[138],"and":[139],"evaluation.":[140],"Code:":[141],"https://github.com/rohnson1999/AutoVQA-G":[142]},"counts_by_year":[],"updated_date":"2026-04-22T06:07:44.442478","created_date":"2026-04-22T00:00:00"}
