{"id":"https://openalex.org/W7135043166","doi":"https://doi.org/10.48550/arxiv.2603.10757","title":"CodePercept: Code-Grounded Visual STEM Perception for MLLMs","display_name":"CodePercept: Code-Grounded Visual STEM Perception for MLLMs","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135043166","doi":"https://doi.org/10.48550/arxiv.2603.10757"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10757","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059748327","display_name":"Tongkun Guan","orcid":"https://orcid.org/0000-0003-3346-8315"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guan, Tongkun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128900065","display_name":"Zhibo Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Zhibo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111582686","display_name":"Jianqiang Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Jianqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128866967","display_name":"Mingkun Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Mingkun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128871803","display_name":"Zhengtao Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Zhengtao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128841291","display_name":"Zijian Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Zijian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128920108","display_name":"Ruilin Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Ruilin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128843092","display_name":"Ruize Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Ruize","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128889692","display_name":"Songtao Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Songtao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128880980","display_name":"Peng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128883044","display_name":"Wei Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128852673","display_name":"Junyang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Junyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128833932","display_name":"Xiaokang Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xiaokang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5059748327"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.769599974155426,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.769599974155426,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.03689999878406525,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12859","display_name":"Cell Image Analysis Techniques","score":0.014999999664723873,"subfield":{"id":"https://openalex.org/subfields/1304","display_name":"Biophysics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.6572999954223633},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6377999782562256},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.5015000104904175},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.49970000982284546},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.42149999737739563},{"id":"https://openalex.org/keywords/codebase","display_name":"Codebase","score":0.37610000371932983},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.36579999327659607},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.36419999599456787}],"concepts":[{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.6572999954223633},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6553000211715698},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6377999782562256},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5388000011444092},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.5015000104904175},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.49970000982284546},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.42149999737739563},{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.37610000371932983},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.36419999599456787},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.33550000190734863},{"id":"https://openalex.org/C200220432","wikidata":"https://www.wikidata.org/wiki/Q7936208","display_name":"Vision science","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3206999897956848},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31439998745918274},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C2781067378","wikidata":"https://www.wikidata.org/wiki/Q17027399","display_name":"Interpretability","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C153207627","wikidata":"https://www.wikidata.org/wiki/Q863873","display_name":"Code word","level":3,"score":0.260699987411499}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10757","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10757","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6719657182693481}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"When":[0],"MLLMs":[1,73],"fail":[2],"at":[3,212],"Science,":[4],"Technology,":[5],"Engineering,":[6],"and":[7,33,206],"Mathematics":[8],"(STEM)":[9],"visual":[10,57,171,195],"reasoning,":[11],"a":[12,38,78,100,165,184],"fundamental":[13],"question":[14],"arises:":[15],"is":[16,210],"it":[17],"due":[18],"to":[19,144],"perceptual":[20,80],"deficiencies":[21],"or":[22],"reasoning":[23,34],"limitations?":[24],"Through":[25],"systematic":[26],"scaling":[27,41,45],"analysis":[28],"that":[29,86,107,168,186],"independently":[30],"scales":[31],"perception":[32,42,49,70,155,172],"components,":[35],"we":[36,97,161],"uncover":[37],"critical":[39],"insight:":[40],"consistently":[43],"outperforms":[44],"reasoning.":[46,58],"This":[47],"reveals":[48],"as":[50,77,123,183],"the":[51,69,90,130,149],"true":[52],"lever":[53],"limiting":[54],"current":[55],"STEM":[56,94,139,174],"Motivated":[59],"by":[60,74],"this":[61,109,159],"insight,":[62],"our":[63,191],"work":[64,178],"focuses":[65],"on":[66,180],"systematically":[67],"enhancing":[68],"capabilities":[71],"of":[72,93,151],"establishing":[75],"code":[76,82,122,199],"powerful":[79],"medium--executable":[81],"provides":[83],"precise":[84],"semantics":[85],"naturally":[87],"align":[88],"with":[89],"structured":[91],"nature":[92],"visuals.":[95],"Specifically,":[96],"construct":[98],"ICC-1M,":[99],"large-scale":[101],"dataset":[102],"comprising":[103],"1M":[104],"Image-Caption-Code":[105],"triplets":[106],"materializes":[108],"code-as-perception":[110],"paradigm":[111],"through":[112,197],"two":[113],"complementary":[114],"approaches:":[115],"(1)":[116],"Code-Grounded":[117],"Caption":[118],"Generation":[119],"treats":[120],"executable":[121,198],"ground":[124],"truth":[125],"for":[126,154,201],"image":[127,202],"captions,":[128],"eliminating":[129],"hallucinations":[131],"inherent":[132],"in":[133,173],"existing":[134,177],"knowledge":[135],"distillation":[136],"methods;":[137],"(2)":[138],"Image-to-Code":[140],"Translation":[141],"prompts":[142],"models":[143],"generate":[145],"reconstruction":[146],"code,":[147],"mitigating":[148],"ambiguity":[150],"natural":[152],"language":[153],"enhancement.":[156],"To":[157],"validate":[158],"paradigm,":[160],"further":[162],"introduce":[163],"STEM2Code-Eval,":[164],"novel":[166],"benchmark":[167,192],"directly":[169],"evaluates":[170],"domains.":[175],"Unlike":[176],"relying":[179],"problem-solving":[181],"accuracy":[182],"proxy":[185],"only":[187],"measures":[188],"problem-relevant":[189],"understanding,":[190],"requires":[193],"comprehensive":[194],"comprehension":[196],"generation":[200],"reconstruction,":[203],"providing":[204],"deterministic":[205],"verifiable":[207],"assessment.":[208],"Code":[209],"available":[211],"https://github.com/TongkunGuan/Qwen-CodePercept.":[213]},"counts_by_year":[],"updated_date":"2026-03-13T14:25:03.468858","created_date":"2026-03-13T00:00:00"}
