{"id":"https://openalex.org/W7162627097","doi":"https://doi.org/10.48550/arxiv.2605.28615","title":"Compositional Text-to-Image Generation Via Region-aware Bimodal Direct Preference Optimization","display_name":"Compositional Text-to-Image Generation Via Region-aware Bimodal Direct Preference Optimization","publication_year":2026,"publication_date":"2026-05-27","ids":{"openalex":"https://openalex.org/W7162627097","doi":"https://doi.org/10.48550/arxiv.2605.28615"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.28615","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28615","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.28615","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5127833141","display_name":"Zhuohan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhuohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101798999","display_name":"Wujian Peng","orcid":"https://orcid.org/0009-0001-6428-276X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Wujian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137257348","display_name":"Yitong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yitong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137274393","display_name":"Zuxuan Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zuxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8511000275611877,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8511000275611877,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.042899999767541885,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.01510000042617321,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.633400022983551},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6069999933242798},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.574999988079071},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.477400004863739},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4641000032424927},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4514999985694885}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7164999842643738},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.633400022983551},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6069999933242798},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.574999988079071},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48260000348091125},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.477400004863739},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4641000032424927},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4514999985694885},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.43540000915527344},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.39730000495910645},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31540000438690186},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3052999973297119},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2962999939918518},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2955999970436096},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.26339998841285706},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.28615","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28615","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.28615","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28615","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"the":[1,81,93,132],"rapid":[2],"progress":[3],"of":[4,38,134],"text-to-image":[5,40,139],"(T2I)":[6],"models,":[7],"generating":[8],"images":[9],"that":[10,115],"accurately":[11],"reflect":[12],"complex":[13,85,138],"compositional":[14,39,110,120],"prompts":[15],"(covering":[16],"attribute":[17],"bindings,":[18],"object":[19],"relationships,":[20],"counting)":[21],"still":[22],"remains":[23],"challenging.":[24],"To":[25,90],"address":[26],"this,":[27],"we":[28,62,98],"propose":[29],"BiDPO,":[30],"a":[31,52,100,142],"framework":[32],"to":[33,50,66,76,83,104,109,147],"enhance":[34,92],"T2I":[35],"model's":[36],"capability":[37],"generation.":[41,89],"We":[42],"begin":[43],"by":[44],"introducing":[45],"an":[46],"carefully":[47],"designed":[48],"pipeline":[49],"construct":[51],"large-scale":[53],"preference":[54],"dataset,":[55],"BiComp,":[56],"with":[57],"strictly":[58],"quality":[59],"control.":[60],"Then,":[61],"extend":[63],"Diffusion":[64],"DPO":[65],"jointly":[67],"optimize":[68],"image":[69],"and":[70,144],"text":[71,86],"preferences,":[72],"which":[73],"is":[74],"shown":[75],"greatly":[77],"effective":[78],"in":[79,88],"improving":[80],"models":[82,94],"follow":[84],"prompt":[87],"further":[91],"for":[95,137],"fine-grained":[96],"alignment,":[97],"employ":[99],"region-level":[101],"guidance":[102],"method":[103],"focus":[105],"on":[106],"regions":[107],"relevant":[108],"concepts.":[111],"Experimental":[112],"results":[113],"demonstrate":[114],"our":[116],"BiDPO":[117],"substantially":[118],"improves":[119],"fidelity,":[121],"consistently":[122],"outperforming":[123],"prior":[124],"methods":[125],"across":[126],"multiple":[127],"benchmarks.":[128],"Our":[129],"approach":[130],"highlights":[131],"potential":[133],"preference-based":[135],"fine-tuning":[136],"tasks,":[140],"offering":[141],"flexible":[143],"scalable":[145],"alternative":[146],"existing":[148],"techniques.":[149]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-29T00:00:00"}
