{"id":"https://openalex.org/W4415937508","doi":"https://doi.org/10.48550/arxiv.2511.00391","title":"VinciCoder: Unifying Multimodal Code Generation via Coarse-to-fine Visual Reinforcement Learning","display_name":"VinciCoder: Unifying Multimodal Code Generation via Coarse-to-fine Visual Reinforcement Learning","publication_year":2025,"publication_date":"2025-11-01","ids":{"openalex":"https://openalex.org/W4415937508","doi":"https://doi.org/10.48550/arxiv.2511.00391"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2511.00391","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.00391","pdf_url":"https://arxiv.org/pdf/2511.00391","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.00391","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5104174550","display_name":"Xuanle Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhao, Xuanle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102575213","display_name":"D. A. Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Deyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101581937","display_name":"Zhixiong Zeng","orcid":"https://orcid.org/0000-0002-3822-1074"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeng, Zhixiong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100689319","display_name":"Lei Chen","orcid":"https://orcid.org/0000-0002-2269-2912"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Lei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030346569","display_name":"Haibo Qiu","orcid":"https://orcid.org/0000-0001-8589-4717"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiu, Haibo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023965104","display_name":"Jing Huang","orcid":"https://orcid.org/0000-0001-8704-154X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035189764","display_name":"Yufeng Zhong","orcid":"https://orcid.org/0000-0003-2253-1497"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Yufeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037151071","display_name":"Liming Zheng","orcid":"https://orcid.org/0000-0003-3179-2162"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Liming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100537074","display_name":"Yilin Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yilin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100324185","display_name":"Lin Ma","orcid":"https://orcid.org/0000-0002-9810-956X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Lin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5104174550"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4977000057697296,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4977000057697296,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.19769999384880066,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.05939999967813492,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.6858999729156494},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.6324999928474426},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6245999932289124},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.5062999725341797},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.37130001187324524},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.3255000114440918}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.796500027179718},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.6858999729156494},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.6324999928474426},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6245999932289124},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5805000066757202},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.5062999725341797},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4138999879360199},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.37130001187324524},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3255000114440918},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C150292731","wikidata":"https://www.wikidata.org/wiki/Q1342704","display_name":"Code review","level":5,"score":0.2799000144004822},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.2615000009536743},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.257099986076355}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2511.00391","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.00391","pdf_url":"https://arxiv.org/pdf/2511.00391","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2511.00391","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2511.00391","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.00391","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.00391","pdf_url":"https://arxiv.org/pdf/2511.00391","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"code":[1,54,84,88,124,152],"generation":[2,55,85,125],"has":[3],"garnered":[4],"significant":[5],"interest":[6],"within":[7],"the":[8,12,38,142],"research":[9],"community.":[10],"Despite":[11],"notable":[13],"success":[14],"of":[15,40,144],"recent":[16,134],"vision-language":[17],"models":[18],"(VLMs)":[19],"on":[20,28,121],"specialized":[21],"tasks":[22,81],"like":[23],"chart-to-code":[24],"generation,":[25],"their":[26],"reliance":[27],"single-task":[29],"training":[30,64],"regimens":[31],"fosters":[32],"a":[33,51,62,70,93,101],"narrow":[34],"paradigm":[35],"that":[36,57,128],"hinders":[37],"development":[39],"generalized":[41],"\\textbf{VI}sio\\textbf{N}":[42],"\\textbf{C}ode":[43],"\\textbf{I}ntelligence.":[44],"In":[45],"this":[46,59],"work,":[47],"we":[48,91],"introduce":[49,92],"\\textbf{VinciCoder},":[50],"unified":[52],"multimodal":[53,123],"model":[56,154],"addresses":[58],"limitation":[60],"via":[61],"two-stage":[63],"framework.":[65],"We":[66],"begin":[67],"by":[68,109],"constructing":[69],"large-scale":[71],"Supervised":[72],"Finetuning":[73],"(SFT)":[74],"corpus":[75],"comprising":[76],"1.6M":[77],"image-code":[78],"pairs":[79],"for":[80],"involving":[82],"direct":[83],"and":[86,115,153],"visual-based":[87],"refinement.":[89],"Subsequently,":[90],"Visual":[94],"Reinforcement":[95],"Learning":[96],"(ViRL)":[97],"strategy,":[98],"which":[99],"employs":[100],"coarse-to-fine":[102,147],"reward":[103],"mechanism":[104],"to":[105],"improve":[106],"visual":[107,111],"fidelity":[108],"calculating":[110],"similarity":[112],"across":[113],"local":[114],"global":[116],"image":[117],"patches.":[118],"Extensive":[119],"experiments":[120],"diverse":[122],"benchmarks":[126],"demonstrate":[127],"VinciCoder":[129],"achieves":[130],"state-of-the-art":[131],"performance,":[132],"surpassing":[133],"open-source":[135],"models.":[136],"The":[137,150],"ablation":[138],"study":[139],"further":[140],"validates":[141],"effectiveness":[143],"our":[145],"proposed":[146],"ViRL":[148],"strategy.":[149],"data,":[151],"is":[155],"available":[156],"at":[157],"https://github.com/DocTron-hub/VinciCoder.":[158]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-05T00:00:00"}
