{"id":"https://openalex.org/W7151338543","doi":"https://doi.org/10.48550/arxiv.2604.04172","title":"GENFIG1: Visual Summaries of Scholarly Work as a Challenge for Vision-Language Models","display_name":"GENFIG1: Visual Summaries of Scholarly Work as a Challenge for Vision-Language Models","publication_year":2026,"publication_date":"2026-04-05","ids":{"openalex":"https://openalex.org/W7151338543","doi":"https://doi.org/10.48550/arxiv.2604.04172"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04172","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056154248","display_name":"Yutong Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guan, Yaohan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048611471","display_name":"P. Wang","orcid":"https://orcid.org/0009-0003-3059-8795"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Pristina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133086663","display_name":"Najim Dehak","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dehak, Najim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133078947","display_name":"Alan Yuille","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuille, Alan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133110648","display_name":"Jieneng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jieneng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133141998","display_name":"Daniel Khashabi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khashabi, Daniel","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5056154248"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8048999905586243,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8048999905586243,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.04989999905228615,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.04470000043511391,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6985999941825867},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.6782000064849854},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.6477000117301941},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6373999714851379},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.6119999885559082},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5557000041007996},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5375000238418579},{"id":"https://openalex.org/keywords/computational-model","display_name":"Computational model","score":0.39890000224113464}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7287999987602234},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6985999941825867},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.6782000064849854},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.6477000117301941},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6373999714851379},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.6119999885559082},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5557000041007996},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5375000238418579},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5159000158309937},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.39890000224113464},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3986000120639801},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.38999998569488525},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.38019999861717224},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3747999966144562},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.32989999651908875},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.3212999999523163},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3077999949455261},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3057999908924103},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3050999939441681},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.2937000095844269},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.25999999046325684},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.2540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.826528012752533,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"many":[1],"science":[2,41],"papers,":[3],"\"Figure":[4],"1\"":[5],"serves":[6,203],"as":[7,85,204],"the":[8,13,38,73,96,117,121,125,147,151,189,196],"primary":[9],"visual":[10,42,108],"summary":[11],"of":[12,40,76,120,181],"core":[14],"research":[15],"idea.":[16],"These":[17],"figures":[18,67],"are":[19],"visually":[20,93,142],"simple":[21],"yet":[22],"conceptually":[23],"rich,":[24],"often":[25],"requiring":[26],"significant":[27,192],"effort":[28],"and":[29,71,82,115,129,134,143,164,186],"iteration":[30],"by":[31],"human":[32,175],"authors":[33],"to":[34,65,146],"get":[35],"right,":[36],"highlighting":[37],"difficulty":[39],"communication.":[43],"With":[44],"this":[45,201],"intuition,":[46],"we":[47],"introduce":[48,165],"GENFIG1,":[49],"a":[50,77,132,179,205],"benchmark":[51,152,202],"for":[52,62,100,195,207],"generative":[53],"AI":[54],"models":[55,61,111,183],"(e.g.,":[56],"Vision-Language":[57],"Models).":[58],"GENFIG1":[59,88,185],"evaluates":[60],"their":[63],"ability":[64],"produce":[66],"that":[68,103,138,170,188],"clearly":[69],"express":[70],"motivate":[72],"central":[74],"idea":[75],"paper":[78],"(title,":[79],"abstract,":[80],"introduction,":[81],"figure":[83],"caption)":[84],"input.":[86,148],"Solving":[87],"requires":[89],"more":[90],"than":[91],"producing":[92],"appealing":[94],"graphics:":[95],"task":[97,190],"entails":[98],"reasoning":[99],"text-to-image":[101],"generation":[102],"couples":[104],"scientific":[105],"understanding":[106],"with":[107,173],"synthesis.":[109],"Specifically,":[110],"must":[112],"(i)":[113],"comprehend":[114],"grasp":[116],"technical":[118],"concepts":[119,141],"paper,":[122],"(ii)":[123],"identify":[124],"most":[126],"salient":[127],"ones,":[128],"(iii)":[130],"design":[131],"coherent":[133],"aesthetically":[135],"effective":[136],"graphic":[137],"conveys":[139],"those":[140],"is":[144],"faithful":[145],"We":[149,177,199],"curate":[150],"from":[153],"papers":[154],"published":[155],"at":[156],"top":[157],"deep-learning":[158],"conferences,":[159],"apply":[160],"stringent":[161],"quality":[162],"control,":[163],"an":[166],"automatic":[167],"evaluation":[168],"metric":[169],"correlates":[171],"well":[172],"expert":[174],"judgments.":[176],"evaluate":[178],"suite":[180],"representative":[182],"on":[184],"demonstrate":[187],"presents":[191],"challenges,":[193],"even":[194],"best-performing":[197],"systems.":[198],"hope":[200],"foundation":[206],"future":[208],"progress":[209],"in":[210],"multimodal":[211],"AI.":[212]},"counts_by_year":[],"updated_date":"2026-04-08T06:07:18.267832","created_date":"2026-04-08T00:00:00"}
