{"id":"https://openalex.org/W7147685267","doi":"https://doi.org/10.48550/arxiv.2603.29620","title":"Unify-Agent: A Unified Multimodal Agent for World-Grounded Image Synthesis","display_name":"Unify-Agent: A Unified Multimodal Agent for World-Grounded Image Synthesis","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7147685267","doi":"https://doi.org/10.48550/arxiv.2603.29620"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29620","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29620","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29620","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132602068","display_name":"Shuang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Shuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106748272","display_name":"Q. Y. Shou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shou, Quanxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048946763","display_name":"Hangting Chen","orcid":"https://orcid.org/0000-0002-4085-4364"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hangting","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132680447","display_name":"Yucheng Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yucheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132658801","display_name":"Kaituo Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Kaituo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132685131","display_name":"Wenbo Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Wenbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132688218","display_name":"Yi-Fan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yi-Fan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132698664","display_name":"Yunlong Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Yunlong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132699302","display_name":"Wenxuan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Wenxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132681832","display_name":"Mingyang Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Mingyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010781772","display_name":"D. Z. Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Dasen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132571396","display_name":"Bolin Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Bolin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132564246","display_name":"Manyuan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Manyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132635689","display_name":"Shi-Xue Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shi-Xue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101637347","display_name":"Zhengkai Jiang","orcid":"https://orcid.org/0000-0003-4064-994X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Zhengkai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030165591","display_name":"Lucas Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Lucas","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132648651","display_name":"Zhao Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhong, Zhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132663916","display_name":"Yu Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132590179","display_name":"Nanyun Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Nanyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":19,"corresponding_author_ids":["https://openalex.org/A5132602068"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8449000120162964,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8449000120162964,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.0778999999165535,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.006399999838322401,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6467999815940857},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5200999975204468},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4788999855518341},{"id":"https://openalex.org/keywords/unified-model","display_name":"Unified Model","score":0.46650001406669617},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.41990000009536743},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.35040000081062317},{"id":"https://openalex.org/keywords/parametric-statistics","display_name":"Parametric statistics","score":0.31060001254081726},{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.3068000078201294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6869999766349792},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6467999815940857},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5200999975204468},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4936999976634979},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4788999855518341},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.46650001406669617},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.41990000009536743},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.3068000078201294},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2962999939918518},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C24574437","wikidata":"https://www.wikidata.org/wiki/Q7135228","display_name":"Parametric model","level":3,"score":0.2793000042438507},{"id":"https://openalex.org/C2989087649","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Image synthesis","level":3,"score":0.2703000009059906},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.26660001277923584},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29620","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29620","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29620","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29620","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/11","display_name":"Sustainable cities and communities","score":0.49250873923301697}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Unified":[0],"multimodal":[1,66,84,100],"models":[2],"provide":[3],"a":[4,64,98,126],"natural":[5],"and":[6,12,39,89,103,134,161,198],"promising":[7],"architecture":[8],"for":[9,68,109,184,200],"understanding":[10],"diverse":[11,159],"complex":[13],"real-world":[14,34,50],"knowledge":[15,142,170],"while":[16,166],"generating":[17],"high-quality":[18,106],"images.":[19],"However,":[20],"they":[21],"still":[22],"rely":[23],"primarily":[24],"on":[25,49],"frozen":[26],"parametric":[27],"knowledge,":[28],"which":[29,72],"makes":[30],"them":[31],"struggle":[32],"with":[33],"image":[35,70,74,111,186,204],"generation":[36,75,120,164,199],"involving":[37],"long-tail":[38,135],"knowledge-intensive":[40],"concepts.":[41],"Inspired":[42],"by":[43],"the":[44,117,168,173,191],"broad":[45],"success":[46],"of":[47,81,131,172,181,193],"agents":[48],"tasks,":[51,165],"we":[52,61,96],"explore":[53],"agentic":[54,78,119,203],"modeling":[55,183],"to":[56],"address":[57],"this":[58],"limitation.":[59],"Specifically,":[60],"present":[62],"Unify-Agent,":[63],"unified":[65,156],"agent":[67,107],"world-grounded":[69,110,185],"synthesis,":[71,112,187],"reframes":[73],"as":[76],"an":[77,178],"pipeline":[79,102],"consisting":[80],"prompt":[82],"understanding,":[83],"evidence":[85],"searching,":[86,197],"grounded":[87],"recaptioning,":[88],"final":[90],"synthesis.":[91,205],"To":[92],"train":[93],"our":[94,148,188],"model,":[95],"construct":[97],"tailored":[99],"data":[101],"curate":[104],"143K":[105],"trajectories":[108],"enabling":[113],"effective":[114],"supervision":[115],"over":[116,153],"full":[118],"process.":[121],"We":[122],"further":[123],"introduce":[124],"FactIP,":[125],"benchmark":[127],"covering":[128],"12":[129],"categories":[130],"culturally":[132],"significant":[133],"factual":[136],"concepts":[137],"that":[138,147],"explicitly":[139],"requires":[140],"external":[141],"grounding.":[143],"Extensive":[144],"experiments":[145],"show":[146],"proposed":[149],"Unify-Agent":[150],"substantially":[151],"improves":[152],"its":[154],"base":[155],"model":[157],"across":[158],"benchmarks":[160],"real":[162],"world":[163,169],"approaching":[167],"capabilities":[171],"strongest":[174],"closed-source":[175],"models.":[176],"As":[177],"early":[179],"exploration":[180],"agent-based":[182],"work":[189],"highlights":[190],"value":[192],"tightly":[194],"coupling":[195],"reasoning,":[196],"reliable":[201],"open-world":[202]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-02T00:00:00"}
