{"id":"https://openalex.org/W4368755685","doi":"https://doi.org/10.48550/arxiv.2305.02317","title":"Visual Chain of Thought: Bridging Logical Gaps with Multimodal Infillings","display_name":"Visual Chain of Thought: Bridging Logical Gaps with Multimodal Infillings","publication_year":2023,"publication_date":"2023-05-03","ids":{"openalex":"https://openalex.org/W4368755685","doi":"https://doi.org/10.48550/arxiv.2305.02317"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2305.02317","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.02317","pdf_url":"https://arxiv.org/pdf/2305.02317","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2305.02317","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062485317","display_name":"Daniel M. Rose","orcid":"https://orcid.org/0009-0005-6431-1080"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rose, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002253981","display_name":"Vaishnavi Himakunthala","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Himakunthala, Vaishnavi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085633542","display_name":"Andy Ouyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Andy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090510466","display_name":"Ryan He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Ryan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012895395","display_name":"Alex Mei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mei, Alex","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036525093","display_name":"Yujie Lu","orcid":"https://orcid.org/0000-0002-0691-2129"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yujie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034254782","display_name":"Michael Saxon","orcid":"https://orcid.org/0000-0001-7306-5030"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saxon, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070263103","display_name":"Chinmay Sonar","orcid":"https://orcid.org/0000-0003-0026-4455"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sonar, Chinmay","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027209078","display_name":"Diba Mirza","orcid":"https://orcid.org/0000-0002-0969-5238"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mirza, Diba","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100702485","display_name":"William Yang Wang","orcid":"https://orcid.org/0000-0001-6153-8240"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, William Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.7387999892234802,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.7387999892234802,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.911250114440918},{"id":"https://openalex.org/keywords/chain","display_name":"Chain (unit)","score":0.4432035982608795},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.356588214635849},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.34537196159362793},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3330209255218506},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.09185081720352173},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.07911062240600586}],"concepts":[{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.911250114440918},{"id":"https://openalex.org/C199185054","wikidata":"https://www.wikidata.org/wiki/Q552299","display_name":"Chain (unit)","level":2,"score":0.4432035982608795},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.356588214635849},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.34537196159362793},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3330209255218506},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.09185081720352173},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.07911062240600586},{"id":"https://openalex.org/C1276947","wikidata":"https://www.wikidata.org/wiki/Q333","display_name":"Astronomy","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2305.02317","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.02317","pdf_url":"https://arxiv.org/pdf/2305.02317","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2305.02317","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2305.02317","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2305.02317","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.02317","pdf_url":"https://arxiv.org/pdf/2305.02317","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7300000190734863,"id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G6688003181","display_name":"Collaborative Research: Scaling the Early Research Scholars Program","funder_award_id":"1821415","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4368755685.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W4388870064","https://openalex.org/W2210139803","https://openalex.org/W4235186151","https://openalex.org/W2054685365","https://openalex.org/W2056057048","https://openalex.org/W2667588871","https://openalex.org/W2272354214","https://openalex.org/W2084768720"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,8,17,28],"large":[3],"language":[4,29],"models":[5,13],"elicit":[6],"reasoning":[7,26,50],"a":[9,18,62],"chain-of-thought":[10,67,146],"that":[11,45,65,91,105,136],"allows":[12],"to":[14,40,72,86,97,123,152],"decompose":[15],"problems":[16],"human-like":[19],"fashion.":[20],"Though":[21],"this":[22],"paradigm":[23],"improves":[24],"multi-step":[25,118],"ability":[27],"models,":[30],"it":[31],"is":[32,51],"limited":[33],"by":[34],"being":[35],"unimodal":[36],"and":[37,94,127,131,140],"applied":[38],"mainly":[39],"question-answering":[41],"tasks.":[42,57],"We":[43,120],"claim":[44],"incorporating":[46],"visual":[47,84],"augmentation":[48,144],"into":[49,116],"essential,":[52],"especially":[53],"for":[54,102],"complex,":[55],"imaginative":[56],"Consequently,":[58],"we":[59],"introduce":[60],"VCoT,":[61],"novel":[63,95,139],"method":[64,82],"leverages":[66],"prompting":[68],"with":[69],"vision-language":[70],"grounding":[71],"recursively":[73],"bridge":[74],"the":[75,99,124],"logical":[76,100],"gaps":[77,101],"within":[78],"sequential":[79],"data.":[80],"Our":[81],"uses":[83],"guidance":[85],"generate":[87],"synthetic":[88,142],"multimodal":[89],"infillings":[90],"add":[92],"consistent":[93,141],"information":[96],"reduce":[98],"downstream":[103,154],"tasks":[104],"can":[106,149],"benefit":[107],"from":[108],"temporal":[109],"reasoning,":[110],"as":[111,113],"well":[112],"provide":[114],"interpretability":[115],"models'":[117],"reasoning.":[119],"apply":[121],"VCoT":[122,137],"Visual":[125],"Storytelling":[126],"WikiHow":[128],"summarization":[129],"datasets":[130],"demonstrate":[132],"through":[133],"human":[134],"evaluation":[135],"offers":[138],"data":[143],"beating":[145],"baselines,":[147],"which":[148],"be":[150],"used":[151],"enhance":[153],"performance.":[155]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2023-05-05T00:00:00"}
