{"id":"https://openalex.org/W4412889861","doi":"https://doi.org/10.18653/v1/2025.acl-long.1411","title":"Performance Gap in Entity Knowledge Extraction Across Modalities in Vision Language Models","display_name":"Performance Gap in Entity Knowledge Extraction Across Modalities in Vision Language Models","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412889861","doi":"https://doi.org/10.18653/v1/2025.acl-long.1411"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2025.acl-long.1411","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.1411","pdf_url":"https://aclanthology.org/2025.acl-long.1411.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.acl-long.1411.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087870256","display_name":"I. Bernard Cohen","orcid":"https://orcid.org/0000-0001-8142-9092"},"institutions":[{"id":"https://openalex.org/I16391192","display_name":"Tel Aviv University","ror":"https://ror.org/04mhzgx49","country_code":"IL","type":"education","lineage":["https://openalex.org/I16391192"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Ido Cohen","raw_affiliation_strings":["Tel Aviv University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tel Aviv University","institution_ids":["https://openalex.org/I16391192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099282585","display_name":"Daniela Gottesman","orcid":null},"institutions":[{"id":"https://openalex.org/I16391192","display_name":"Tel Aviv University","ror":"https://ror.org/04mhzgx49","country_code":"IL","type":"education","lineage":["https://openalex.org/I16391192"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Daniela Gottesman","raw_affiliation_strings":["Tel Aviv University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tel Aviv University","institution_ids":["https://openalex.org/I16391192"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065717258","display_name":"Mor Geva","orcid":"https://orcid.org/0000-0001-9529-6315"},"institutions":[{"id":"https://openalex.org/I16391192","display_name":"Tel Aviv University","ror":"https://ror.org/04mhzgx49","country_code":"IL","type":"education","lineage":["https://openalex.org/I16391192"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Mor Geva","raw_affiliation_strings":["Tel Aviv University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tel Aviv University","institution_ids":["https://openalex.org/I16391192"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072571599","display_name":"Raja Giryes","orcid":"https://orcid.org/0000-0002-2830-0297"},"institutions":[{"id":"https://openalex.org/I16391192","display_name":"Tel Aviv University","ror":"https://ror.org/04mhzgx49","country_code":"IL","type":"education","lineage":["https://openalex.org/I16391192"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Raja Giryes","raw_affiliation_strings":["Tel Aviv University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tel Aviv University","institution_ids":["https://openalex.org/I16391192"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.9349,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.77198906,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"29095","last_page":"29108"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9794999957084656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9768000245094299,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.7776923179626465},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7579681277275085},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6317249536514282},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5459737181663513},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3291240334510803}],"concepts":[{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.7776923179626465},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7579681277275085},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6317249536514282},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5459737181663513},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3291240334510803},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.acl-long.1411","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.1411","pdf_url":"https://aclanthology.org/2025.acl-long.1411.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.acl-long.1411","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.acl-long.1411","pdf_url":"https://aclanthology.org/2025.acl-long.1411.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.5099999904632568,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320322596","display_name":"Tel Aviv University","ror":"https://ror.org/04mhzgx49"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4412889861.pdf","grobid_xml":"https://content.openalex.org/works/W4412889861.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2185469136","https://openalex.org/W2011264131","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4306353150","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Vision-language":[0],"models":[1,55],"(VLMs)":[2],"excel":[3],"at":[4,184],"extracting":[5],"and":[6,78,81,173],"reasoning":[7,179],"about":[8,18,34],"information":[9,97,123],"from":[10,93,99,125],"images.Yet,":[11],"their":[12,178],"capacity":[13],"to":[14,84,102,110],"leverage":[15],"internal":[16,169],"knowledge":[17],"specific":[19],"entities":[20],"remains":[21],"underexplored.This":[22],"work":[23],"investigates":[24],"the":[25,57,119,131,140,157,168],"disparity":[26],"in":[27,38,42,95,130,139,155],"model":[28,158],"performance":[29],"when":[30],"answering":[31],"factual":[32],"questions":[33],"an":[35,43],"entity":[36,58,76],"described":[37],"text":[39],"versus":[40],"depicted":[41],"image.Our":[44],"results":[45],"reveal":[46,111],"a":[47,71,152],"significant":[48],"accuracy":[49],"drop":[50],"-reaching":[51],"18%":[52],"for":[53,148,162,176],"some":[54],"-when":[56],"is":[59],"presented":[60],"visually":[61],"instead":[62],"of":[63,171],"textually.To":[64],"study":[65],"this":[66,90,185],"gap":[67],"we":[68,105],"present":[69],"POPVQA,":[70],"dataset":[72],"which":[73],"allows":[74],"separating":[75],"recognition":[77],"question":[79],"answering,":[80],"use":[82,106],"it":[83],"benchmark":[85],"several":[86],"models.We":[87],"hypothesize":[88],"that":[89],"decline":[91],"arises":[92],"limitations":[94],"how":[96,156],"flows":[98],"image":[100,114,136],"tokens":[101,115,127],"query":[103],"tokens.Thus,":[104],"mechanistic":[107],"interpretability":[108],"tools":[109],"that,":[112],"although":[113],"are":[116],"preprocessed":[117],"by":[118],"vision":[120],"encoder,":[121],"meaningful":[122],"flow":[124],"these":[126],"occurs":[128],"only":[129],"much":[132],"deeper":[133],"layers.Furthermore,":[134],"critical":[135],"processing":[137],"happens":[138],"language":[141],"model's":[142],"middle":[143],"layers,":[144],"allowing":[145],"few":[146],"layers":[147,161],"consecutive":[149],"reasoning,":[150],"highlighting":[151],"potential":[153],"inefficiency":[154],"utilizes":[159],"its":[160],"reasoning.These":[163],"insights":[164],"shed":[165],"light":[166],"on":[167],"mechanics":[170],"VLMs":[172],"offer":[174],"pathways":[175],"enhancing":[177],"capabilities.POPVQA":[180],"can":[181],"be":[182],"found":[183],"link.":[186]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
