{"id":"https://openalex.org/W7160278524","doi":"https://doi.org/10.1109/wacv61042.2026.00145","title":"You May Speak Freely: Improving the Fine-Grained Visual Recognition Capabilities of Multimodal Large Language Models with Answer Extraction","display_name":"You May Speak Freely: Improving the Fine-Grained Visual Recognition Capabilities of Multimodal Large Language Models with Answer Extraction","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7160278524","doi":"https://doi.org/10.1109/wacv61042.2026.00145"},"language":null,"primary_location":{"id":"doi:10.1109/wacv61042.2026.00145","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00145","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132644864","display_name":"Logan Lawrence","orcid":null},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Logan Lawrence","raw_affiliation_strings":["University of Massachusetts,Amherst"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Massachusetts,Amherst","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036639026","display_name":"Oindrila Saha","orcid":null},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Oindrila Saha","raw_affiliation_strings":["University of Massachusetts,Amherst"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Massachusetts,Amherst","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015773635","display_name":"Megan Wei","orcid":"https://orcid.org/0000-0002-1486-6409"},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Megan Wei","raw_affiliation_strings":["Brown University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Brown University","institution_ids":["https://openalex.org/I175594653"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135340320","display_name":"Chen Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chen Sun","raw_affiliation_strings":["Brown University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Brown University","institution_ids":["https://openalex.org/I175594653"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052551454","display_name":"Subhransu Maji","orcid":"https://orcid.org/0000-0002-3869-9334"},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Subhransu Maji","raw_affiliation_strings":["University of Massachusetts,Amherst"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Massachusetts,Amherst","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024685173","display_name":"Grant Van Horn","orcid":"https://orcid.org/0000-0003-2953-9651"},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Grant Van Horn","raw_affiliation_strings":["University of Massachusetts,Amherst"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Massachusetts,Amherst","institution_ids":["https://openalex.org/I24603500"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.68692306,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1428","last_page":"1437"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9016000032424927,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9016000032424927,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.025800000876188278,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.01860000006854534,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.3621000051498413},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3418999910354614},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.3334999978542328},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.32249999046325684},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.3068000078201294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7037000060081482},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5519000291824341},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5295000076293945},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.3621000051498413},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3418999910354614},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.3334999978542328},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.32249999046325684},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3190999925136566},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.30000001192092896},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2840000092983246},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26919999718666077},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26109999418258667}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/wacv61042.2026.00145","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wacv61042.2026.00145","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.508468508720398}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W12634471","https://openalex.org/W1954152232","https://openalex.org/W2081580037","https://openalex.org/W2117539524","https://openalex.org/W2138011018","https://openalex.org/W2533598788","https://openalex.org/W2797977484","https://openalex.org/W3160638507","https://openalex.org/W4390873312","https://openalex.org/W4394780772","https://openalex.org/W4402669754","https://openalex.org/W4402669926","https://openalex.org/W4403081466","https://openalex.org/W4404783537","https://openalex.org/W4404784287","https://openalex.org/W4412887922","https://openalex.org/W4415796274","https://openalex.org/W7133193597","https://openalex.org/W7133220561","https://openalex.org/W7133231369"],"related_works":[],"abstract_inverted_index":{"Despite":[0],"the":[1,10,18,67,72,101,120,126,138,147,150,192],"renewed":[2],"interest":[3],"in":[4,57,66,78,179,202],"zero-shot":[5],"visual":[6,175],"classification":[7,182],"due":[8],"to":[9,54,69,89,94,136,161],"rise":[11],"of":[12,20,24,49,149,172,181,197],"Multimodal":[13],"Large":[14],"Language":[15],"Models":[16],"(MLLMs),":[17],"problem":[19],"evaluating":[21,178],"free-form":[22],"responses":[23],"auto-regressive":[25],"models":[26],"remains":[27],"a":[28,113,170],"persistent":[29],"challenge.":[30],"Most":[31],"existing":[32],"works":[33],"focus":[34],"on":[35],"language-only":[36],"tasks":[37,56,201],"or":[38],"don\u2019t":[39],"consider":[40],"Multiple":[41],"Choice":[42],"Questions":[43],"(MCQs)":[44],"beyond":[45],"5-way":[46],"options,":[47],"both":[48],"which":[50,117],"are":[51,65,74],"critical":[52],"capabilities":[53],"solve":[55],"Fine-Grained":[58],"Visual":[59],"Classification":[60],"(FGVC)":[61],"where":[62,97],"choice":[63,92,102,155],"counts":[64],"hundreds":[68],"thousands":[70],"and":[71,183,185],"choices":[73],"highly":[75,80],"related.":[76],"Furthermore,":[77],"this":[79,108,188],"multi-way":[81],"MCQ":[82],"setting":[83],"it":[84],"is":[85,104],"not":[86],"clear":[87],"how":[88],"extend":[90],"LLM":[91],"extraction":[93],"retrieval-based":[95],"problems,":[96],"computing":[98],"probabilities":[99],"over":[100,169,191],"set":[103],"computationally":[105],"costly.":[106],"In":[107,142],"work":[109],"we":[110,145],"investigate":[111],"nlg2choice,":[112],"simple":[114],"two-stage":[115],"method":[116,160],"first":[118],"asks":[119],"MLLM":[121],"an":[122,157],"open-ended":[123],"question":[124],"for":[125],"task":[127],"with":[128,156],"minimal":[129],"constraints,":[130],"then":[131],"uses":[132],"text-only":[133],"constrained":[134,151],"decoding":[135],"predict":[137],"most":[139],"likely":[140],"choice.":[141],"retrieval":[143],"settings,":[144],"compute":[146],"probability":[148],"response":[152],"taking":[153],"that":[154,187,195],"early":[158],"stopping":[159],"significantly":[162],"improve":[163],"throughput.":[164],"Our":[165],"results":[166],"show":[167,186],"improvement":[168],"suite":[171],"seven":[173],"fine-grained":[174],"datasets":[176],"when":[177],"terms":[180],"retrieval,":[184],"performance":[189],"holds":[190],"various":[193],"ways":[194],"users":[196],"LLMs":[198],"can":[199],"implement":[200],"natural":[203],"language.":[204]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
