{"id":"https://openalex.org/W7155168445","doi":"https://doi.org/10.48550/arxiv.2604.19697","title":"Unveiling Fine-Grained Visual Traces: Evaluating Multimodal Interleaved Reasoning Chains in Multimodal STEM Tasks","display_name":"Unveiling Fine-Grained Visual Traces: Evaluating Multimodal Interleaved Reasoning Chains in Multimodal STEM Tasks","publication_year":2026,"publication_date":"2026-04-21","ids":{"openalex":"https://openalex.org/W7155168445","doi":"https://doi.org/10.48550/arxiv.2604.19697"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.19697","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19697","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.19697","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134304975","display_name":"Jing Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jin, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134274563","display_name":"Hao Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134333853","display_name":"Yan Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063901345","display_name":"Yihang Lou","orcid":"https://orcid.org/0000-0002-8143-389X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lou, Yihang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134335851","display_name":"Zhenke Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhenke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134298001","display_name":"Tianrun Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Tianrun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134316121","display_name":"Juntong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Juntong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053939397","display_name":"Yongkang Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Yongkang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062508341","display_name":"Fanhu Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Fanhu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103097433","display_name":"Xuanyu Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xuanyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054178885","display_name":"Yige Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Xu, Yige","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yige","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5134304975"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9361000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9361000061035156,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.012299999594688416,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.003599999938160181,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/abductive-reasoning","display_name":"Abductive reasoning","score":0.5357999801635742},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.49309998750686646},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4672999978065491},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.44589999318122864},{"id":"https://openalex.org/keywords/reasoning-system","display_name":"Reasoning system","score":0.39559999108314514},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.3831000030040741},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.37790000438690186},{"id":"https://openalex.org/keywords/verifiable-secret-sharing","display_name":"Verifiable secret sharing","score":0.34619998931884766}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7681999802589417},{"id":"https://openalex.org/C166088908","wikidata":"https://www.wikidata.org/wiki/Q308495","display_name":"Abductive reasoning","level":2,"score":0.5357999801635742},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5202999711036682},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.49309998750686646},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4672999978065491},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.44589999318122864},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.39559999108314514},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.37790000438690186},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.36640000343322754},{"id":"https://openalex.org/C85847156","wikidata":"https://www.wikidata.org/wiki/Q59015987","display_name":"Verifiable secret sharing","level":3,"score":0.34619998931884766},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34450000524520874},{"id":"https://openalex.org/C202269582","wikidata":"https://www.wikidata.org/wiki/Q2644277","display_name":"Complementarity (molecular biology)","level":2,"score":0.3440999984741211},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.34200000762939453},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.31869998574256897},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2745000123977661},{"id":"https://openalex.org/C44210515","wikidata":"https://www.wikidata.org/wiki/Q16968978","display_name":"Bespoke","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.2578999996185303},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2563000023365021}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.19697","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19697","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.19697","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.19697","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7771171927452087,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"large":[1],"language":[2],"models":[3,133],"(MLLMs)":[4],"have":[5],"shown":[6],"promising":[7],"reasoning":[8,20,51,79,121,166],"abilities,":[9],"yet":[10],"evaluating":[11],"their":[12],"performance":[13],"in":[14,80],"specialized":[15],"domains":[16],"remains":[17],"challenging.":[18],"STEM":[19,165],"is":[21,83,181],"a":[22,61,86,102,129,171],"particularly":[23],"valuable":[24],"testbed":[25],"because":[26],"it":[27],"provides":[28],"highly":[29],"verifiable":[30],"feedback,":[31],"but":[32],"existing":[33],"benchmarks":[34],"often":[35],"permit":[36],"unimodal":[37],"shortcuts":[38],"due":[39],"to":[40,118],"modality":[41],"redundancy":[42],"and":[43,72,96,111,149,167],"focus":[44],"mainly":[45],"on":[46,141],"final-answer":[47],"accuracy,":[48],"overlooking":[49],"the":[50],"process":[52],"itself.":[53],"To":[54],"address":[55],"this":[56],"challenge,":[57],"we":[58],"introduce":[59],"StepSTEM:":[60],"graduate-level":[62],"benchmark":[63,172],"of":[64,77,132,176],"283":[65],"problems":[66],"across":[67,128],"mathematics,":[68],"physics,":[69],"chemistry,":[70],"biology,":[71],"engineering":[73],"for":[74,107,162,173],"fine-grained":[75,174],"evaluation":[76,105,175],"cross-modal":[78,164],"MLLMs.":[81],"StepSTEM":[82,169],"constructed":[84],"through":[85],"rigorous":[87],"curation":[88],"pipeline":[89],"that":[90,135],"enforces":[91],"strict":[92],"complementarity":[93],"between":[94],"textual":[95,142],"visual":[97],"inputs.":[98],"We":[99],"further":[100],"propose":[101],"general":[103],"step-level":[104],"framework":[106],"both":[108],"text-only":[109],"chain-of-thought":[110],"interleaved":[112],"image-text":[113],"reasoning,":[114,143],"using":[115],"dynamic":[116],"programming":[117],"align":[119],"predicted":[120],"steps":[122],"with":[123,144],"multiple":[124],"reference":[125],"solutions.":[126],"Experiments":[127],"wide":[130],"range":[131],"show":[134],"current":[136],"MLLMs":[137],"still":[138],"rely":[139],"heavily":[140],"even":[145],"Gemini":[146],"3.1":[147],"Pro":[148],"Claude":[150],"Opus":[151],"4.6":[152],"achieving":[153],"only":[154],"38.29%":[155],"accuracy.":[156],"These":[157],"results":[158],"highlight":[159],"substantial":[160],"headroom":[161],"genuine":[163],"position":[168],"as":[170],"multimodal":[177],"reasoning.":[178],"Source":[179],"code":[180],"available":[182],"at":[183],"https://github.com/lll-hhh/STEPSTEM.":[184]},"counts_by_year":[],"updated_date":"2026-05-12T06:07:45.972803","created_date":"2026-04-23T00:00:00"}
