{"id":"https://openalex.org/W7154589222","doi":"https://doi.org/10.14428/esann/2026.es2026-372","title":"Code-Guided Reasoning in Vision-Language Models for Complex Diagram Understanding","display_name":"Code-Guided Reasoning in Vision-Language Models for Complex Diagram Understanding","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7154589222","doi":"https://doi.org/10.14428/esann/2026.es2026-372"},"language":null,"primary_location":{"id":"doi:10.14428/esann/2026.es2026-372","is_oa":true,"landing_page_url":"https://doi.org/10.14428/esann/2026.es2026-372","pdf_url":"https://doi.org/10.14428/esann/2026.es2026-372","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ESANN 2026 proceesdings","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.14428/esann/2026.es2026-372","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133743380","display_name":"Daniel Steinigen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Steinigen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133737623","display_name":"Lucie Flek","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lucie Flek","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133776747","display_name":"Sebastian Houben","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sebastian Houben","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.57180276,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"93","last_page":"98"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.37700000405311584,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.37700000405311584,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.12710000574588776,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11450","display_name":"Model-Driven Software Engineering Techniques","score":0.05889999866485596,"subfield":{"id":"https://openalex.org/subfields/1712","display_name":"Software"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.3499000072479248},{"id":"https://openalex.org/keywords/diagram","display_name":"Diagram","score":0.30379998683929443},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.2858999967575073},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.25369998812675476},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.2535000145435333}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5580000281333923},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37459999322891235},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C186399060","wikidata":"https://www.wikidata.org/wiki/Q959962","display_name":"Diagram","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29600000381469727},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.2858999967575073},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.28119999170303345},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.25369998812675476},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14428/esann/2026.es2026-372","is_oa":true,"landing_page_url":"https://doi.org/10.14428/esann/2026.es2026-372","pdf_url":"https://doi.org/10.14428/esann/2026.es2026-372","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ESANN 2026 proceesdings","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.14428/esann/2026.es2026-372","is_oa":true,"landing_page_url":"https://doi.org/10.14428/esann/2026.es2026-372","pdf_url":"https://doi.org/10.14428/esann/2026.es2026-372","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ESANN 2026 proceesdings","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2603180164","display_name":null,"funder_award_id":"IIS-2143529","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3936121087","display_name":"CAREER: Fostering Prosocial Behavior and Well-Being in Online Communities","funder_award_id":"2143529","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7154589222.pdf","grobid_xml":"https://content.openalex.org/works/W7154589222.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Understanding":[0],"complex":[1,160],"structured":[2],"diagrams,":[3],"such":[4,28],"as":[5],"circuit":[6],"schematics,":[7],"molecular":[8],"structures,":[9],"musical":[10],"notation,":[11],"or":[12],"business":[13],"process":[14],"models,":[15],"requires":[16],"precise":[17],"symbolic,":[18],"spatial,":[19],"and":[20,93],"relational":[21],"reasoning.Current":[22],"vision-language":[23],"models":[24,83,123],"(VLMs)":[25],"struggle":[26],"with":[27,108,167],"tasks":[29],"because":[30],"they":[31],"lack":[32],"access":[33],"to":[34,52,84,114,158],"the":[35,60,102,109,112,116],"underlying":[36],"symbolic":[37,57,117,132,146,168],"structure":[38],"that":[39,62,122,144],"governs":[40],"these":[41],"diagrams.We":[42],"introduce":[43],"a":[44,68,152],"training":[45,150],"paradigm":[46],"in":[47,65,87],"which":[48],"VLMs":[49],"explicitly":[50],"learn":[51],"reason":[53],"through":[54],"an":[55],"intermediate":[56,133],"representation":[58,90,118],"of":[59,125],"image":[61],"is":[63,105],"expressed":[64],"code.We":[66],"generate":[67,85],"large":[69,81],"synthetic":[70],"dataset":[71],"covering":[72],"21":[73],"diagram":[74,139,165],"types":[75],"across":[76],"7":[77],"domains":[78],"by":[79,163],"prompting":[80],"language":[82],"code":[86,104,128,147],"specific":[88],"formal":[89],"languages":[91],"(FRLs)":[92],"rendering":[94],"them":[95],"into":[96,148],"paired":[97],"code-image":[98],"samples.During":[99],"VLM":[100,149,156],"training,":[101],"FRL":[103],"provided":[106],"along":[107],"image,":[110],"enabling":[111],"model":[113],"incorporate":[115],"during":[119],"reasoning.Experiments":[120],"show":[121],"capable":[124],"producing":[126],"valid":[127],"benefit":[129],"from":[130],"this":[131],"layer,":[134],"yielding":[135],"improved":[136],"accuracy":[137],"on":[138],"understanding":[140],"tasks.Our":[141],"results":[142],"demonstrate":[143],"integrating":[145],"offers":[151],"promising":[153],"direction":[154],"for":[155],"design":[157],"handle":[159],"visual":[161],"data":[162],"bridging":[164],"perception":[166],"reasoning.":[169]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-17T00:00:00"}
