{"id":"https://openalex.org/W7118313491","doi":"https://doi.org/10.1109/tmm.2026.3651070","title":"Visual Context and Commonsense-Guided Causal Chain-of-Thoughts for Visual Commonsense Reasoning","display_name":"Visual Context and Commonsense-Guided Causal Chain-of-Thoughts for Visual Commonsense Reasoning","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7118313491","doi":"https://doi.org/10.1109/tmm.2026.3651070"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2026.3651070","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3651070","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121810862","display_name":"Xinyu Li","orcid":null},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xinyu Li","raw_affiliation_strings":["School of Computer Science and Technology, East China Normal University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0008-9059-5090","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, East China Normal University, Shanghai, China","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jing Zhao","orcid":"https://orcid.org/0000-0003-0158-5330"},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Zhao","raw_affiliation_strings":["School of Computer Science and Technology, East China Normal University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-0158-5330","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, East China Normal University, Shanghai, China","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042163232","display_name":"Tongquan Wei","orcid":"https://orcid.org/0000-0002-7421-1711"},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tongquan Wei","raw_affiliation_strings":["School of Computer Science and Technology, East China Normal University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-7421-1711","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, East China Normal University, Shanghai, China","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5013783407","display_name":"Shiliang Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiliang Sun","raw_affiliation_strings":["School of Automation and Intelligent Sensing, Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-7069-3752","affiliations":[{"raw_affiliation_string":"School of Automation and Intelligent Sensing, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5121810862"],"corresponding_institution_ids":["https://openalex.org/I66867065"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02375413,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"2719","last_page":"2730"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/commonsense-reasoning","display_name":"Commonsense reasoning","score":0.9459999799728394},{"id":"https://openalex.org/keywords/commonsense-knowledge","display_name":"Commonsense knowledge","score":0.8102999925613403},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5253000259399414},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4108999967575073},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.38190001249313354},{"id":"https://openalex.org/keywords/folk-psychology","display_name":"Folk psychology","score":0.35030001401901245},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.32659998536109924}],"concepts":[{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.9459999799728394},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8137000203132629},{"id":"https://openalex.org/C30542707","wikidata":"https://www.wikidata.org/wiki/Q1603203","display_name":"Commonsense knowledge","level":3,"score":0.8102999925613403},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5884000062942505},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5253000259399414},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4108999967575073},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4072999954223633},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.38190001249313354},{"id":"https://openalex.org/C128482177","wikidata":"https://www.wikidata.org/wiki/Q1429140","display_name":"Folk psychology","level":2,"score":0.35030001401901245},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.32659998536109924},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.32440000772476196},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.3098999857902527},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C103057564","wikidata":"https://www.wikidata.org/wiki/Q4751139","display_name":"Analytic reasoning","level":3,"score":0.3005000054836273},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2759000062942505},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.2614000141620636},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25839999318122864}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2026.3651070","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2026.3651070","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4971749353","display_name":null,"funder_award_id":"62576206","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6487037979","display_name":null,"funder_award_id":"62476089","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320312071","display_name":"Ministry of Education, Libya","ror":"https://ror.org/02w030k33"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322370","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67"},{"id":"https://openalex.org/F4320322999","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Humans":[0],"are":[1],"capable":[2],"of":[3,15,65,72,125,146,204],"inferring":[4],"dynamic":[5],"context":[6,67,92],"from":[7],"a":[8,29,82,115],"still":[9],"image":[10],"and,":[11],"with":[12,114],"the":[13,63,69,98,130,143,186,197,202,205],"provision":[14],"additional":[16],"commonsense":[17,23,73,88,94,100,132,140,170],"knowledge,":[18],"can":[19],"accurately":[20],"complete":[21],"visual":[22,87,99,111,147,169],"reasoning":[24,89,101,162,171],"tasks.":[25],"Nevertheless,":[26],"this":[27,78,126],"remains":[28],"highly":[30],"challenging":[31,59],"cognitive-level":[32],"task":[33],"for":[34,47],"current":[35],"vision-language":[36],"models.":[37],"Previous":[38],"work":[39,127],"has":[40],"primarily":[41],"focused":[42],"on":[43,180],"utilizing":[44],"models":[45,113,195],"fine-tuned":[46],"specific":[48],"downstream":[49,182],"tasks":[50,183],"and":[51,68,93,109,154,196,200],"introduces":[52],"external":[53],"world":[54],"knowledge":[55,74,133,141],"to":[56,85,119,168,193],"tackle":[57],"these":[58],"tasks,":[60],"while":[61],"neglecting":[62],"importance":[64],"accurate":[66],"key":[70],"role":[71],"in":[75,129],"reasoning.":[76,122],"In":[77],"paper,":[79],"we":[80],"propose":[81],"novel":[83],"framework":[84],"enhance":[86],"by":[90,172],"incorporating":[91,173],"knowledge.":[95],"We":[96],"decompose":[97],"problem":[102],"into":[103],"four":[104],"distinct":[105],"but":[106],"interrelated":[107],"sub-problems":[108],"combine":[110],"language":[112,117],"large":[116],"model":[118],"enable":[120],"zero-shot":[121],"The":[123],"uniqueness":[124],"lies":[128],"proposed":[131,187,206],"filtering":[134],"module,":[135],"which":[136],"filters":[137],"out":[138],"relevant":[139],"through":[142],"causal":[144],"strength":[145],"context.":[148],"This":[149],"process":[150],"constructs":[151],"Visual":[152],"Context":[153],"Commonsense-guided":[155],"Causal":[156],"Chain-of-Thought":[157],"(<inline-formula":[158],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[159],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[160],"notation=\"LaTeX\">$\\mathrm{VC^{3}}$</tex-math></inline-formula>-CoT)":[161],"paths,":[163],"thereby":[164],"providing":[165],"double":[166],"robustness":[167],"weighted":[174],"majority":[175],"voting":[176],"strategy.":[177],"Extensive":[178],"experiments":[179],"several":[181],"demonstrate":[184],"that":[185],"method":[188],"significantly":[189],"improves":[190],"performance":[191],"compared":[192],"baseline":[194],"state-of-the-art":[198],"method,":[199],"confirm":[201],"effectiveness":[203],"components.":[207]},"counts_by_year":[],"updated_date":"2026-04-18T05:59:34.339393","created_date":"2026-01-08T00:00:00"}
