{"id":"https://openalex.org/W7137182620","doi":"https://doi.org/10.48550/arxiv.2603.12829","title":"coDrawAgents: A Multi-Agent Dialogue Framework for Compositional Image Generation","display_name":"coDrawAgents: A Multi-Agent Dialogue Framework for Compositional Image Generation","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7137182620","doi":"https://doi.org/10.48550/arxiv.2603.12829"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12829","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12829","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12829","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129470102","display_name":"Chunhan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Chunhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129536590","display_name":"Qifeng Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Qifeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023225646","display_name":"Jiahui Pan","orcid":"https://orcid.org/0000-0002-7576-6743"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Jia-Hui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102608394","display_name":"Ka-Hei Hui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hui, Ka-Hei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129576084","display_name":"Jingyu Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Jingyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129547832","display_name":"Yuming Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Yuming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129528339","display_name":"Bin Sheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheng, Bin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129499244","display_name":"Xihui Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Xihui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008124931","display_name":"Wenjuan Gong","orcid":"https://orcid.org/0000-0001-7805-3629"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gong, Wenjuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129571921","display_name":"Zhengzhe Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhengzhe","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5129470102"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7289000153541565,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7289000153541565,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.17100000381469727,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.016699999570846558,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6229000091552734},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5461000204086304},{"id":"https://openalex.org/keywords/interpreter","display_name":"Interpreter","score":0.5325999855995178},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5138999819755554},{"id":"https://openalex.org/keywords/planner","display_name":"Planner","score":0.460099995136261},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4462999999523163},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4415999948978424},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.39329999685287476}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8118000030517578},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6229000091552734},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5461000204086304},{"id":"https://openalex.org/C122783720","wikidata":"https://www.wikidata.org/wiki/Q183065","display_name":"Interpreter","level":2,"score":0.5325999855995178},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5138999819755554},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48840001225471497},{"id":"https://openalex.org/C2776999362","wikidata":"https://www.wikidata.org/wiki/Q2349274","display_name":"Planner","level":2,"score":0.460099995136261},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4462999999523163},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4415999948978424},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.39329999685287476},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.36579999327659607},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.35690000653266907},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33000001311302185},{"id":"https://openalex.org/C205783811","wikidata":"https://www.wikidata.org/wiki/Q11629","display_name":"Painting","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C2775955345","wikidata":"https://www.wikidata.org/wiki/Q7449071","display_name":"Semantic mapping","level":2,"score":0.3215000033378601},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3084999918937683},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C2780580889","wikidata":"https://www.wikidata.org/wiki/Q41363","display_name":"Panorama","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2779075496","wikidata":"https://www.wikidata.org/wiki/Q6898824","display_name":"Mondrian","level":3,"score":0.27799999713897705},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2678000032901764},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26249998807907104}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12829","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12829","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12829","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12829","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Text-to-image":[0],"generation":[1],"has":[2],"advanced":[3],"rapidly,":[4],"but":[5],"existing":[6,205],"models":[7],"still":[8],"struggle":[9],"with":[10,30,79,103],"faithfully":[11],"composing":[12],"multiple":[13],"objects":[14,78,102,153],"and":[15,37,54,76,131,134,179,189,200],"preserving":[16],"their":[17],"attributes":[18],"in":[19,112,176],"complex":[20],"scenes.":[21],"We":[22],"propose":[23],"coDrawAgents,":[24],"an":[25,123],"interactive":[26],"multi-agent":[27,57],"dialogue":[28],"framework":[29],"four":[31],"specialized":[32],"agents:":[33],"Interpreter,":[34,91],"Planner,":[35],"Checker,":[36],"Painter":[38,143],"that":[39,192],"collaborate":[40],"to":[41,157,204],"improve":[42],"compositional":[43],"generation.":[44,87],"The":[45,120],"Interpreter":[46],"adaptively":[47],"decides":[48],"between":[49],"a":[50,55,95],"direct":[51],"text-to-image":[52],"pathway":[53],"layout-aware":[56,61],"process.":[58],"In":[59],"the":[60,65,80,90,92,104,113,118,142,145,155],"mode,":[62],"it":[63],"parses":[64],"prompt":[66],"into":[67,154],"attribute-rich":[68],"object":[69],"descriptors,":[70],"ranks":[71],"them":[72],"by":[73,89,127,148],"semantic":[74,82,106],"salience,":[75],"groups":[77],"same":[81,105],"priority":[83,107],"level":[84,108],"for":[85,101,161],"joint":[86],"Guided":[88],"Planner":[93],"adopts":[94],"divide-and-conquer":[96],"strategy,":[97],"incrementally":[98],"proposing":[99],"layouts":[100,136],"while":[109],"grounding":[110,174],"decisions":[111],"evolving":[114],"visual":[115,177],"context":[116,160],"of":[117],"canvas.":[119],"Checker":[121],"introduces":[122],"explicit":[124,181],"error-correction":[125],"mechanism":[126],"validating":[128],"spatial":[129,198],"consistency":[130],"attribute":[132,201],"alignment,":[133,197],"refining":[135],"before":[137],"they":[138],"are":[139],"rendered.":[140],"Finally,":[141],"synthesizes":[144],"image":[146],"step":[147],"step,":[149],"incorporating":[150],"newly":[151],"planned":[152],"canvas":[156],"provide":[158],"richer":[159],"subsequent":[162],"iterations.":[163],"Together,":[164],"these":[165],"agents":[166],"address":[167],"three":[168],"key":[169],"challenges:":[170],"reducing":[171],"layout":[172],"complexity,":[173],"planning":[175],"context,":[178],"enabling":[180],"error":[182],"correction.":[183],"Extensive":[184],"experiments":[185],"on":[186],"benchmarks":[187],"GenEval":[188],"DPG-Bench":[190],"demonstrate":[191],"coDrawAgents":[193],"substantially":[194],"improves":[195],"text-image":[196],"accuracy,":[199],"binding":[202],"compared":[203],"methods.":[206]},"counts_by_year":[],"updated_date":"2026-03-17T07:05:13.627479","created_date":"2026-03-17T00:00:00"}
