{"id":"https://openalex.org/W7135045332","doi":"https://doi.org/10.1162/tacl.a.653","title":"Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from <scp>SciCap</scp> Challenge 2023","display_name":"Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from <scp>SciCap</scp> Challenge 2023","publication_year":2026,"publication_date":"2026-03-05","ids":{"openalex":"https://openalex.org/W7135045332","doi":"https://doi.org/10.1162/tacl.a.653"},"language":"en","primary_location":{"id":"doi:10.1162/tacl.a.653","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl.a.653","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.653/2587241/tacl.a.653.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.653/2587241/tacl.a.653.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Ting-Yao \u2018Edward\u2019 Hsu","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ting-Yao \u2018Edward\u2019 Hsu","raw_affiliation_strings":["Pennsylvania State University, USA. txh357@psu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pennsylvania State University, USA. txh357@psu.edu","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064187871","display_name":"Yi-Li Hsu","orcid":null},"institutions":[{"id":"https://openalex.org/I25846049","display_name":"National Tsing Hua University","ror":"https://ror.org/00zdnkx70","country_code":"TW","type":"education","lineage":["https://openalex.org/I25846049"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yi-Li Hsu","raw_affiliation_strings":["National Tsing Hua University, Taiwan. yili.hsu@iis.sinica.edu.tw"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Tsing Hua University, Taiwan. yili.hsu@iis.sinica.edu.tw","institution_ids":["https://openalex.org/I25846049"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113758586","display_name":"Shaurya Rohatgi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140341","display_name":"Allen Institute","ror":"https://ror.org/03cpe7c52","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210140341"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shaurya Rohatgi","raw_affiliation_strings":["AllSci, USA. srohatgi@allsci.com"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"AllSci, USA. srohatgi@allsci.com","institution_ids":["https://openalex.org/I4210140341"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102729368","display_name":"Chieh-Yang Huang","orcid":"https://orcid.org/0009-0001-6736-9959"},"institutions":[{"id":"https://openalex.org/I4210106763","display_name":"MetaMetrics (United States)","ror":"https://ror.org/01m1p2k59","country_code":"US","type":"company","lineage":["https://openalex.org/I4210106763"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chieh-Yang Huang","raw_affiliation_strings":["MetaMetrics Inc., USA. cyhuang@lexile.com"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"MetaMetrics Inc., USA. cyhuang@lexile.com","institution_ids":["https://openalex.org/I4210106763"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110081955","display_name":"Hwee Tou Ng","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ho Yin Sam Ng","raw_affiliation_strings":["Pennsylvania State University, USA. sam.ng@psu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pennsylvania State University, USA. sam.ng@psu.edu","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128918734","display_name":"Ryan Rossi","orcid":null},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ryan Rossi","raw_affiliation_strings":["Adobe Research, USA. ryrossi@adobe.com"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Adobe Research, USA. ryrossi@adobe.com","institution_ids":["https://openalex.org/I1306409833"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100718934","display_name":"Sungchul Kim","orcid":"https://orcid.org/0000-0003-3580-5290"},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sungchul Kim","raw_affiliation_strings":["Adobe Research, USA. sukim@adobe.com"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Adobe Research, USA. sukim@adobe.com","institution_ids":["https://openalex.org/I1306409833"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128803885","display_name":"Tong Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I1306409833","display_name":"Adobe Systems (United States)","ror":"https://ror.org/059tvcg64","country_code":"US","type":"company","lineage":["https://openalex.org/I1306409833"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tong Yu","raw_affiliation_strings":["Adobe Research, USA. tyu@adobe.com"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Adobe Research, USA. tyu@adobe.com","institution_ids":["https://openalex.org/I1306409833"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128806770","display_name":"Lun-Wei Ku","orcid":null},"institutions":[{"id":"https://openalex.org/I4210098366","display_name":"Institute of Information Science, Academia Sinica","ror":"https://ror.org/00z83z196","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210098366","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Lun-Wei Ku","raw_affiliation_strings":["Institute of Information Science, Academia Sinica, Taiwan. lwku@iis.sinica.edu.tw"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Information Science, Academia Sinica, Taiwan. lwku@iis.sinica.edu.tw","institution_ids":["https://openalex.org/I4210098366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128851154","display_name":"Clyde Lee Giles","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Clyde Lee Giles","raw_affiliation_strings":["Pennsylvania State University, USA. clg20@psu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pennsylvania State University, USA. clg20@psu.edu","institution_ids":["https://openalex.org/I130769515"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5126418139","display_name":"Ting-Hao \u2018Kenneth\u2019 Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I130769515","display_name":"Pennsylvania State University","ror":"https://ror.org/04p491231","country_code":"US","type":"education","lineage":["https://openalex.org/I130769515"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ting-Hao \u2018Kenneth\u2019 Huang","raw_affiliation_strings":["Pennsylvania State University, USA. txh710@psu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Pennsylvania State University, USA. txh710@psu.edu","institution_ids":["https://openalex.org/I130769515"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I130769515"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38070386,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"14","issue":null,"first_page":"233","last_page":"252"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5735999941825867,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5735999941825867,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2290000021457672,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.045499999076128006,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8763999938964844},{"id":"https://openalex.org/keywords/snapshot","display_name":"Snapshot (computer storage)","score":0.6800000071525574},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5311999917030334},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.41670000553131104},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.36329999566078186},{"id":"https://openalex.org/keywords/topic-model","display_name":"Topic model","score":0.34869998693466187}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8763999938964844},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.847599983215332},{"id":"https://openalex.org/C55282118","wikidata":"https://www.wikidata.org/wiki/Q252683","display_name":"Snapshot (computer storage)","level":2,"score":0.6800000071525574},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.5550000071525574},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5311999917030334},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4494999945163727},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.41670000553131104},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.36329999566078186},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.34869998693466187},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2874999940395355},{"id":"https://openalex.org/C2984558057","wikidata":"https://www.wikidata.org/wiki/Q162633","display_name":"Academic community","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.267300009727478},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1162/tacl.a.653","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl.a.653","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.653/2587241/tacl.a.653.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1162/tacl.a.653","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl.a.653","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/TACL.a.653/2587241/tacl.a.653.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306151","display_name":"Alfred P. Sloan Foundation","ror":"https://ror.org/052csg198"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7135045332.pdf","grobid_xml":"https://content.openalex.org/works/W7135045332.grobid-xml"},"referenced_works_count":28,"referenced_works":["https://openalex.org/W2060374126","https://openalex.org/W2095560558","https://openalex.org/W2101105183","https://openalex.org/W2125021734","https://openalex.org/W2127712852","https://openalex.org/W2252152825","https://openalex.org/W2398579902","https://openalex.org/W2416987009","https://openalex.org/W2520300089","https://openalex.org/W2602628042","https://openalex.org/W2952523122","https://openalex.org/W2964237709","https://openalex.org/W2996849306","https://openalex.org/W3010343788","https://openalex.org/W3116465435","https://openalex.org/W3153965221","https://openalex.org/W3211402614","https://openalex.org/W4205384772","https://openalex.org/W4225934689","https://openalex.org/W4389009572","https://openalex.org/W4389518794","https://openalex.org/W4389518939","https://openalex.org/W4389519125","https://openalex.org/W4389523846","https://openalex.org/W4393247979","https://openalex.org/W4394625790","https://openalex.org/W4404780528","https://openalex.org/W4411630291"],"related_works":[],"abstract_inverted_index":{"Abstract":[0],"Since":[1],"the":[2,8,26,54,85,91,103,126,148],"SciCap":[3,28,39,87],"dataset\u2019s":[4],"launch":[5],"in":[6,15,21,75],"2021,":[7],"research":[9],"community":[10],"has":[11],"made":[12],"significant":[13],"progress":[14],"generating":[16,151],"captions":[17,114,128,152],"for":[18,44,153],"scientific":[19,154],"figures":[20],"scholarly":[22],"articles.":[23],"In":[24],"2023,":[25],"first":[27,86],"Challenge":[29,88],"took":[30],"place,":[31],"inviting":[32],"global":[33],"teams":[34],"to":[35,41,140],"use":[36],"an":[37,82],"expanded":[38],"dataset":[40],"develop":[42],"models":[43,59,68,95,123],"captioning":[45],"diverse":[46],"figure":[47,113],"types":[48],"across":[49],"various":[50,76,94],"academic":[51],"fields.":[52],"At":[53],"same":[55],"time,":[56],"text":[57],"generation":[58],"advanced":[60,145],"quickly,":[61],"with":[62],"many":[63],"powerful":[64],"pre-trained":[65],"large":[66],"multimodal":[67],"(LMMs)":[69],"emerging":[70],"that":[71,108],"showed":[72],"impressive":[73],"capabilities":[74],"vision-and-language":[77],"tasks.":[78],"This":[79],"paper":[80],"presents":[81],"overview":[83],"of":[84,93,102,150],"and":[89,124],"details":[90],"performance":[92],"on":[96],"its":[97],"data,":[98],"capturing":[99],"a":[100],"snapshot":[101],"field\u2019s":[104],"state.":[105],"We":[106],"found":[107],"professional":[109],"editors":[110],"overwhelmingly":[111],"preferred":[112],"generated":[115],"by":[116,130],"GPT-4V":[117],"over":[118],"those":[119],"from":[120],"all":[121],"other":[122],"even":[125],"original":[127],"written":[129],"authors.":[131],"Following":[132],"this":[133,142],"key":[134],"finding,":[135],"we":[136],"conducted":[137],"detailed":[138],"analyses":[139],"answer":[141],"question:":[143],"Have":[144],"LMMs":[146],"solved":[147],"task":[149],"figures?":[155]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-13T00:00:00"}
