{"id":"https://openalex.org/W7138462990","doi":"https://doi.org/10.1609/aaai.v40i15.38236","title":"CMMCoT: Enhancing Complex Multi-Image Comprehension via Multi-Modal Chain-of-Thought and Memory Augmentation","display_name":"CMMCoT: Enhancing Complex Multi-Image Comprehension via Multi-Modal Chain-of-Thought and Memory Augmentation","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138462990","doi":"https://doi.org/10.1609/aaai.v40i15.38236"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i15.38236","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i15.38236","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38236/42198","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38236/42198","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129653612","display_name":"Guanghao Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Guanghao Zhang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129720248","display_name":"Tao Zhong","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tao Zhong","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129746140","display_name":"Yan Xia","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Yan Xia","raw_affiliation_strings":["Zhejiang University\nAlibaba Group"],"affiliations":[{"raw_affiliation_string":"Zhejiang University\nAlibaba Group","institution_ids":["https://openalex.org/I76130692","https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039853684","display_name":"Mushui Liu","orcid":"https://orcid.org/0000-0002-2909-7702"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Mushui Liu","raw_affiliation_strings":["Zhejiang University\nAlibaba Group"],"affiliations":[{"raw_affiliation_string":"Zhejiang University\nAlibaba Group","institution_ids":["https://openalex.org/I76130692","https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085870521","display_name":"Zhelun Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhelun Yu","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129720652","display_name":"Haoyuan Li","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haoyuan Li","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129700161","display_name":"Wanggui He","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wanggui He","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129676755","display_name":"Dong She","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dong She","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129694574","display_name":"Yi Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]},{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Yi Wang","raw_affiliation_strings":["Zhejiang University\nAlibaba Group"],"affiliations":[{"raw_affiliation_string":"Zhejiang University\nAlibaba Group","institution_ids":["https://openalex.org/I76130692","https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129677917","display_name":"Hao Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hao Jiang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5129653612"],"corresponding_institution_ids":["https://openalex.org/I4210095624"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.82284041,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"15","first_page":"12430","last_page":"12438"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8932999968528748,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8932999968528748,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.014800000004470348,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.009499999694526196,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.6646999716758728},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.609000027179718},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.5792999863624573},{"id":"https://openalex.org/keywords/memorization","display_name":"Memorization","score":0.5634999871253967},{"id":"https://openalex.org/keywords/abductive-reasoning","display_name":"Abductive reasoning","score":0.4535999894142151},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.45080000162124634},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.43860000371932983},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.43799999356269836},{"id":"https://openalex.org/keywords/analytic-reasoning","display_name":"Analytic reasoning","score":0.4271000027656555}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6715999841690063},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.6646999716758728},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.609000027179718},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.5792999863624573},{"id":"https://openalex.org/C30038468","wikidata":"https://www.wikidata.org/wiki/Q4354775","display_name":"Memorization","level":2,"score":0.5634999871253967},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5196999907493591},{"id":"https://openalex.org/C166088908","wikidata":"https://www.wikidata.org/wiki/Q308495","display_name":"Abductive reasoning","level":2,"score":0.4535999894142151},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.45080000162124634},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.43860000371932983},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.43799999356269836},{"id":"https://openalex.org/C103057564","wikidata":"https://www.wikidata.org/wiki/Q4751139","display_name":"Analytic reasoning","level":3,"score":0.4271000027656555},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.4068000018596649},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3508000075817108},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.3467999994754791},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3384999930858612},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.32109999656677246},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.31839999556541443},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.31290000677108765},{"id":"https://openalex.org/C36964233","wikidata":"https://www.wikidata.org/wiki/Q7920942","display_name":"Verbal reasoning","level":3,"score":0.3075999915599823},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.3000999987125397},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2987000048160553},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2987000048160553},{"id":"https://openalex.org/C21963081","wikidata":"https://www.wikidata.org/wiki/Q11337567","display_name":"Working memory","level":3,"score":0.29820001125335693},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.2971999943256378},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.2590000033378601},{"id":"https://openalex.org/C12186640","wikidata":"https://www.wikidata.org/wiki/Q6815743","display_name":"Memory model","level":3,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i15.38236","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i15.38236","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38236/42198","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i15.38236","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i15.38236","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/38236/42198","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1270634530","display_name":null,"funder_award_id":"LD24F020016","funder_id":"https://openalex.org/F4320338464","funder_display_name":"Natural Science Foundation of Zhejiang Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320338464","display_name":"Natural Science Foundation of Zhejiang Province","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138462990.pdf","grobid_xml":"https://content.openalex.org/works/W7138462990.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0,38],"previous":[1],"multimodal":[2,109],"slow-thinking":[3,174],"methods":[4],"have":[5,169],"demonstrated":[6],"remarkable":[7],"success":[8],"in":[9,43,165],"single-image":[10],"understanding":[11,134],"scenarios,":[12],"their":[13,30],"effectiveness":[14,180],"becomes":[15],"fundamentally":[16],"constrained":[17],"when":[18,41],"extended":[19],"to":[20,162],"more":[21],"complex":[22],"multi-image":[23,45,96,173],"comprehension":[24],"tasks.":[25],"This":[26,127],"limitation":[27],"stems":[28],"from":[29,120],"predominant":[31],"reliance":[32],"on":[33],"text-based":[34],"intermediate":[35,121],"reasoning":[36,72,88,111,122,153],"processes.":[37],"for":[39,95],"human,":[40],"engaging":[42],"sophisticated":[44],"analysis,":[46],"they":[47],"typically":[48],"perform":[49],"two":[50,101],"complementary":[51],"cognitive":[52],"operations:":[53],"(1)":[54,104],"continuous":[55],"cross-image":[56],"visual":[57,68,116],"comparison":[58],"through":[59],"region-of-interest":[60],"matching,":[61],"and":[62],"(2)":[63,140],"dynamic":[64],"memorization":[65],"of":[66,107,143,181],"critical":[67,115],"concepts":[69],"throughout":[70],"the":[71,80,151,179],"chain.":[73],"Motivated":[74],"by":[75],"these":[76],"observations,":[77],"we":[78,168],"propose":[79],"Complex":[81],"Multi-Modal":[82],"Chain-of-Thought":[83],"(CMMCoT)":[84],"framework,":[85],"a":[86,144,171],"multi-step":[87,110],"framework":[89],"that":[90,149],"mimics":[91],"human-like":[92],"\"slow":[93],"thinking\"":[94],"understanding.":[97],"Our":[98],"approach":[99],"incorporates":[100],"key":[102],"innovations:":[103],"The":[105,141],"construction":[106],"interleaved":[108],"chains,":[112],"which":[113],"utilize":[114],"region":[117],"tokens,":[118],"extracted":[119],"steps,":[123],"as":[124],"supervisory":[125],"signals.":[126],"mechanism":[128],"not":[129],"only":[130],"facilitates":[131],"comprehensive":[132],"cross-modal":[133],"but":[135],"also":[136],"enhances":[137],"model":[138],"interpretability.":[139],"introduction":[142],"test-time":[145],"memory":[146],"augmentation":[147],"module":[148],"expands":[150],"model\u2019s":[152],"capacity":[154],"during":[155],"inference":[156],"while":[157],"preserving":[158],"parameter":[159],"efficiency.":[160],"Furthermore,":[161],"facilitate":[163],"research":[164],"this":[166],"direction,":[167],"curated":[170],"novel":[172],"dataset.":[175],"Extensive":[176],"experiments":[177],"demonstrate":[178],"our":[182],"model.":[183]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2026-03-18T00:00:00"}
