{"id":"https://openalex.org/W4393159294","doi":"https://doi.org/10.1609/aaai.v38i5.28212","title":"Improving Automatic VQA Evaluation Using Large Language Models","display_name":"Improving Automatic VQA Evaluation Using Large Language Models","publication_year":2024,"publication_date":"2024-03-24","ids":{"openalex":"https://openalex.org/W4393159294","doi":"https://doi.org/10.1609/aaai.v38i5.28212"},"language":"en","primary_location":{"id":"doi:10.1609/aaai.v38i5.28212","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v38i5.28212","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/28212/28419","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/28212/28419","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010595613","display_name":"Oscar Ma\u00f1as","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164802","display_name":"Mila - Quebec Artificial Intelligence Institute","ror":"https://ror.org/05c22rx21","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210164802"]},{"id":"https://openalex.org/I70931966","display_name":"Universit\u00e9 de Montr\u00e9al","ror":"https://ror.org/0161xgx34","country_code":"CA","type":"education","lineage":["https://openalex.org/I70931966"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Oscar Ma\u00f1as","raw_affiliation_strings":["Mila - Quebec AI Institute\nUniversit\u00e9 de Montr\u00e9al"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Mila - Quebec AI Institute\nUniversit\u00e9 de Montr\u00e9al","institution_ids":["https://openalex.org/I4210164802","https://openalex.org/I70931966"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073456064","display_name":"Benno Krojer","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164802","display_name":"Mila - Quebec Artificial Intelligence Institute","ror":"https://ror.org/05c22rx21","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210164802"]},{"id":"https://openalex.org/I5023651","display_name":"McGill University","ror":"https://ror.org/01pxwe438","country_code":"CA","type":"education","lineage":["https://openalex.org/I5023651"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Benno Krojer","raw_affiliation_strings":["Mila - Quebec AI Institute\nMcGill University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Mila - Quebec AI Institute\nMcGill University","institution_ids":["https://openalex.org/I4210164802","https://openalex.org/I5023651"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063960231","display_name":"Aishwarya Agrawal","orcid":"https://orcid.org/0000-0002-8620-8077"},"institutions":[{"id":"https://openalex.org/I4210164802","display_name":"Mila - Quebec Artificial Intelligence Institute","ror":"https://ror.org/05c22rx21","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210164802"]},{"id":"https://openalex.org/I70931966","display_name":"Universit\u00e9 de Montr\u00e9al","ror":"https://ror.org/0161xgx34","country_code":"CA","type":"education","lineage":["https://openalex.org/I70931966"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Aishwarya Agrawal","raw_affiliation_strings":["Mila - Quebec AI Institute\nUniversit\u00e9 de Montr\u00e9al"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Mila - Quebec AI Institute\nUniversit\u00e9 de Montr\u00e9al","institution_ids":["https://openalex.org/I4210164802","https://openalex.org/I70931966"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5010595613"],"corresponding_institution_ids":["https://openalex.org/I4210164802","https://openalex.org/I70931966"],"apc_list":null,"apc_paid":null,"fwci":20.6796,"has_fulltext":true,"cited_by_count":30,"citation_normalized_percentile":{"value":1.0,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"38","issue":"5","first_page":"4171","last_page":"4179"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10809","display_name":"Occupational Health and Safety Research","score":0.756600022315979,"subfield":{"id":"https://openalex.org/subfields/3614","display_name":"Radiological and Ultrasound Technology"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T10809","display_name":"Occupational Health and Safety Research","score":0.756600022315979,"subfield":{"id":"https://openalex.org/subfields/3614","display_name":"Radiological and Ultrasound Technology"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12945","display_name":"Quality Function Deployment in Product Design","score":0.7156999707221985,"subfield":{"id":"https://openalex.org/subfields/1405","display_name":"Management of Technology and Innovation"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11357","display_name":"Risk and Safety Analysis","score":0.6280999779701233,"subfield":{"id":"https://openalex.org/subfields/1804","display_name":"Statistics, Probability and Uncertainty"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5414782166481018},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4826379418373108},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4047413468360901}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5414782166481018},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4826379418373108},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4047413468360901}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v38i5.28212","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v38i5.28212","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/28212/28419","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v38i5.28212","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v38i5.28212","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/28212/28419","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6399999856948853,"display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320309949","display_name":"Canadian Institute for Advanced Research","ror":"https://ror.org/01sdtdd95"},{"id":"https://openalex.org/F4320332195","display_name":"Samsung","ror":"https://ror.org/04w3jy968"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4393159294.pdf","grobid_xml":"https://content.openalex.org/works/W4393159294.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2478288626","https://openalex.org/W4391913857","https://openalex.org/W2350741829","https://openalex.org/W3204019825"],"abstract_inverted_index":{"8":[0],"years":[1],"after":[2],"the":[3,13,27,49,59,91,116,122,136,166,170,177],"visual":[4],"question":[5],"answering":[6],"(VQA)":[7],"task":[8,114],"was":[9],"proposed,":[10],"accuracy":[11,123],"remains":[12],"primary":[14],"metric":[15,53,138,160],"for":[16,81],"automatic":[17,73],"evaluation.":[18,44],"VQA":[19,51,62,74,105,109,150,171],"Accuracy":[20,52],"has":[21],"been":[22],"effective":[23],"so":[24],"far":[25],"in":[26],"IID":[28],"evaluation":[29,110,178],"setting.":[30],"However,":[31],"our":[32,159],"community":[33],"is":[34,54,66,118],"undergoing":[35],"a":[36,67,79,103,125,129],"shift":[37],"towards":[38],"open-ended":[39],"generative":[40],"models":[41,99,151],"and":[42,57,152,180],"OOD":[43],"In":[45,84],"this":[46,85],"new":[47],"paradigm,":[48],"existing":[50,146],"overly":[55],"stringent":[56],"underestimates":[58],"performance":[60],"of":[61,95,124,131,158],"systems.":[63],"Thus,":[64],"there":[65],"need":[68],"to":[69,89,101,120,145,163,175],"develop":[70],"more":[71],"robust":[72],"metrics":[75,147],"that":[76],"serve":[77],"as":[78,111],"proxy":[80],"human":[82,142,182],"judgment.":[83],"work,":[86],"we":[87],"propose":[88],"leverage":[90],"in-context":[92],"learning":[93],"capabilities":[94],"instruction-tuned":[96],"large":[97],"language":[98],"(LLMs)":[100],"build":[102],"better":[104,139,164],"metric.":[106],"We":[107,134,154,173],"formulate":[108],"an":[112],"answer-rating":[113],"where":[115],"LLM":[117],"instructed":[119],"score":[121],"candidate":[126],"answer":[127],"given":[128],"set":[130],"reference":[132],"answers.":[133],"demonstrate":[135],"proposed":[137],"correlates":[140],"with":[141],"judgment":[143],"compared":[144],"across":[148],"several":[149],"benchmarks.":[153],"hope":[155],"wide":[156],"adoption":[157],"will":[161],"contribute":[162],"estimating":[165],"research":[167],"progress":[168],"on":[169],"task.":[172],"plan":[174],"release":[176],"code":[179],"collected":[181],"judgments.":[183]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":21},{"year":2024,"cited_by_count":5}],"updated_date":"2026-05-07T13:39:58.223016","created_date":"2025-10-10T00:00:00"}
