{"id":"https://openalex.org/W4415537217","doi":"https://doi.org/10.1145/3746027.3755731","title":"Draw with Thought: Unleashing Multimodal Reasoning for Scientific Diagram Generation","display_name":"Draw with Thought: Unleashing Multimodal Reasoning for Scientific Diagram Generation","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537217","doi":"https://doi.org/10.1145/3746027.3755731"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755731","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755731","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5105097814","display_name":"Zhiqing Cui","orcid":null},"institutions":[{"id":"https://openalex.org/I200845125","display_name":"Nanjing University of Information Science and Technology","ror":"https://ror.org/02y0rxk19","country_code":"CN","type":"education","lineage":["https://openalex.org/I200845125"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiqing Cui","raw_affiliation_strings":["Nanjing University of Information Science &amp; Technology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Information Science &amp; Technology, Nanjing, China","institution_ids":["https://openalex.org/I200845125"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120131456","display_name":"Jiahao Yuan","orcid":null},"institutions":[{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiahao Yuan","raw_affiliation_strings":["East China Normal University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"East China Normal University, Shanghai, China","institution_ids":["https://openalex.org/I66867065"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057232450","display_name":"H. Q. Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hanqing Wang","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yanshu Li","orcid":"https://orcid.org/0009-0002-4054-606X"},"institutions":[{"id":"https://openalex.org/I27804330","display_name":"Brown University","ror":"https://ror.org/05gq02987","country_code":"US","type":"education","lineage":["https://openalex.org/I27804330"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yanshu Li","raw_affiliation_strings":["Brown University, Providence, USA"],"affiliations":[{"raw_affiliation_string":"Brown University, Providence, USA","institution_ids":["https://openalex.org/I27804330"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119989772","display_name":"Chenxu Du","orcid":null},"institutions":[{"id":"https://openalex.org/I4800084","display_name":"Southwest Jiaotong University","ror":"https://ror.org/00hn7w693","country_code":"CN","type":"education","lineage":["https://openalex.org/I4800084"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenxu Du","raw_affiliation_strings":["Southwest Jiaotong University, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"Southwest Jiaotong University, Chengdu, China","institution_ids":["https://openalex.org/I4800084"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101441188","display_name":"Zhenglong Ding","orcid":"https://orcid.org/0000-0002-2949-6287"},"institutions":[{"id":"https://openalex.org/I200845125","display_name":"Nanjing University of Information Science and Technology","ror":"https://ror.org/02y0rxk19","country_code":"CN","type":"education","lineage":["https://openalex.org/I200845125"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenglong Ding","raw_affiliation_strings":["Nanjing University of Information Science &amp; Technology, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Information Science &amp; Technology, Nanjing, China","institution_ids":["https://openalex.org/I200845125"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5105097814"],"corresponding_institution_ids":["https://openalex.org/I200845125"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.1599404,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"5050","last_page":"5059"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.9718999862670898,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.965399980545044,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.522599995136261},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4999000132083893},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3756999969482422},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.36039999127388},{"id":"https://openalex.org/keywords/xml","display_name":"XML","score":0.35690000653266907},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3474999964237213},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.3361000120639801},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.3310000002384186}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8080999851226807},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.522599995136261},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4999000132083893},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4259999990463257},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3815999925136566},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3756999969482422},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.36039999127388},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.35690000653266907},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3474999964237213},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3361000120639801},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.3310000002384186},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3257000148296356},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2854999899864197},{"id":"https://openalex.org/C2775945657","wikidata":"https://www.wikidata.org/wiki/Q381442","display_name":"Structuring","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C145644426","wikidata":"https://www.wikidata.org/wiki/Q169411","display_name":"Unified Modeling Language","level":3,"score":0.265500009059906},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.25600001215934753},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2535000145435333},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755731","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755731","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8167056232","display_name":null,"funder_award_id":"No. 62303231","funder_id":"https://openalex.org/F4320336806","funder_display_name":"National Natural Science Foundation of China - State Grid Corporation Joint Fund for Smart Grid"}],"funders":[{"id":"https://openalex.org/F4320336806","display_name":"National Natural Science Foundation of China - State Grid Corporation Joint Fund for Smart Grid","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W261871827","https://openalex.org/W1992524617","https://openalex.org/W2474143829","https://openalex.org/W3159481202","https://openalex.org/W4200524251","https://openalex.org/W4396704947","https://openalex.org/W4403792079"],"related_works":[],"abstract_inverted_index":{"Scientific":[0],"diagrams":[1,66,122],"are":[2,13],"vital":[3],"tools":[4],"for":[5,160],"communicating":[6],"structured":[7],"knowledge":[8],"across":[9,129],"disciplines.":[10],"However,":[11],"they":[12],"often":[14],"published":[15],"as":[16],"static":[17,162],"raster":[18],"images,":[19],"losing":[20],"symbolic":[21],"semantics":[22],"and":[23,38,45,80,99,102,140,153,167,170],"limiting":[24],"reuse.":[25],"While":[26],"Multimodal":[27],"Large":[28],"Language":[29],"Models":[30],"(MLLMs)":[31],"offer":[32],"a":[33,58,116,157],"pathway":[34],"to":[35,64],"bridging":[36],"vision":[37],"structure,":[39],"existing":[40],"methods":[41],"lack":[42],"semantic":[43,100],"control":[44],"structural":[46],"interpretability,":[47],"especially":[48],"on":[49],"complex":[50],"diagrams.":[51],"We":[52],"propose":[53],"Draw":[54],"with":[55,123,144],"Thought":[56],"(DwT),":[57],"training-free":[59],"framework":[60],"that":[61,133],"guides":[62],"MLLMs":[63,131],"reconstruct":[65],"into":[67,90,164],"editable":[68],"mxGraph":[69],"XML":[70,125],"code":[71],"through":[72],"cognitively":[73],"inspired":[74],"Chain-of-Thought":[75],"reasoning.":[76],"DwT":[77],"enables":[78],"interpretable":[79],"controllable":[81],"outputs":[82],"without":[83],"model":[84],"fine-tuning":[85],"by":[86,107],"dividing":[87],"the":[88],"task":[89],"two":[91],"stages:":[92],"Coarse-to-Fine":[93],"Planning,":[94],"which":[95],"handles":[96],"perceptual":[97],"structuring":[98],"specification,":[101],"Structure-Aware":[103],"Code":[104],"Generation,":[105],"enhanced":[106],"format-guided":[108],"refinement.":[109],"To":[110],"support":[111],"evaluation,":[112],"we":[113],"release":[114],"Plot2XML,":[115],"benchmark":[117],"of":[118,174],"247":[119],"real-world":[120],"scientific":[121,175],"gold-standard":[124],"annotations.":[126],"Extensive":[127],"experiments":[128],"eight":[130],"show":[132],"our":[134],"approach":[135],"yields":[136],"high-fidelity,":[137],"semantically":[138],"aligned,":[139],"structurally":[141,165],"valid":[142,166],"reconstructions,":[143],"human":[145],"evaluations":[146],"confirming":[147],"strong":[148],"alignment":[149],"in":[150],"both":[151],"accuracy":[152],"visual":[154],"aesthetics,":[155],"offering":[156],"scalable":[158],"solution":[159],"converting":[161],"visuals":[163],"renderable":[168],"representations":[169],"advancing":[171],"machine":[172],"understanding":[173],"graphics.":[176]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
