{"id":"https://openalex.org/W7148248113","doi":"https://doi.org/10.48550/arxiv.2604.00849","title":"Disentangling to Re-couple: Resolving the Similarity-Controllability Paradox in Subject-Driven Text-to-Image Generation","display_name":"Disentangling to Re-couple: Resolving the Similarity-Controllability Paradox in Subject-Driven Text-to-Image Generation","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7148248113","doi":"https://doi.org/10.48550/arxiv.2604.00849"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00849","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00849","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00849","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132817418","display_name":"Shuang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Shuang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132804369","display_name":"Chao Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132808560","display_name":"Hang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132804910","display_name":"Liqun Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Liqun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132828599","display_name":"Zhenyu Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Zhenyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128645827","display_name":"Te Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Te","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128657706","display_name":"Mengge Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Mengge","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132801179","display_name":"Yuan Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132800449","display_name":"Peng Shu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shu, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132810282","display_name":"Huan Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Huan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132790597","display_name":"Jie Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Jie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5132817418"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8547999858856201,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.8547999858856201,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.019899999722838402,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.019600000232458115,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/subject","display_name":"Subject (documents)","score":0.7968000173568726},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6223999857902527},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.5806000232696533},{"id":"https://openalex.org/keywords/decoupling","display_name":"Decoupling (probability)","score":0.5472999811172485},{"id":"https://openalex.org/keywords/text-generation","display_name":"Text generation","score":0.47119998931884766},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.44830000400543213},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4009000062942505},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.39309999346733093}],"concepts":[{"id":"https://openalex.org/C2777855551","wikidata":"https://www.wikidata.org/wiki/Q12310021","display_name":"Subject (documents)","level":2,"score":0.7968000173568726},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7657999992370605},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6223999857902527},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.5806000232696533},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5756999850273132},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.5472999811172485},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.47130000591278076},{"id":"https://openalex.org/C2985684807","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Text generation","level":2,"score":0.47119998931884766},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.44830000400543213},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.39309999346733093},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.3650999963283539},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3407000005245209},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.3294999897480011},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.27239999175071716},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.263700008392334},{"id":"https://openalex.org/C2983812711","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Text recognition","level":3,"score":0.2583000063896179},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00849","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00849","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00849","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00849","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.5255326628684998,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Subject-Driven":[0],"Text-to-Image":[1],"(T2I)":[2],"Generation":[3],"aims":[4],"to":[5,66,123,133,145,167],"preserve":[6],"a":[7,16,78,159],"subject's":[8,36],"identity":[9,102],"while":[10,117],"editing":[11],"its":[12,152],"context":[13],"based":[14],"on":[15],"text":[17,50,119],"prompt.":[18],"A":[19],"core":[20],"challenge":[21],"in":[22],"this":[23,42,140,156],"task":[24],"is":[25,103,121],"the":[26,35,46,59,62,70,97,107,111,115,118,126,130,149,170,174,181],"\"similarity-controllability":[27],"paradox\",":[28],"where":[29,129],"enhancing":[30],"textual":[31,89,190],"control":[32],"often":[33,54],"degrades":[34],"fidelity,":[37],"and":[38,61,84,88,151,163,173,188,204],"vice-versa.":[39],"We":[40,154],"argue":[41],"paradox":[43],"stems":[44],"from":[45,106],"ambiguous":[47],"role":[48],"of":[49,99,114],"prompts,":[51],"which":[52],"are":[53],"tasked":[55],"with":[56,110],"describing":[57],"both":[58],"subject":[60,101,131,150,172,186],"desired":[63],"modifications,":[64],"leading":[65],"conflicting":[67],"signals":[68],"for":[69],"model.":[71],"To":[72],"resolve":[73],"this,":[74],"we":[75],"propose":[76],"DisCo,":[77],"novel":[79],"framework":[80],"that":[81,195],"first":[82],"Disntangles":[83],"then":[85],"re-Couples":[86],"visual":[87],"information.":[90],"First,":[91],"our":[92,196],"textual-visual":[93],"decoupling":[94],"module":[95],"isolates":[96],"sources":[98],"information:":[100],"extracted":[104],"exclusively":[105],"reference":[108],"image":[109],"entity":[112],"word":[113],"subject,":[116],"prompt":[120],"simplified":[122],"contain":[124],"only":[125],"modification":[127],"command,":[128],"refers":[132],"general":[134],"pronouns,":[135],"eliminating":[136],"descriptive":[137],"ambiguity.":[138],"However,":[139],"strict":[141],"separation":[142],"can":[143],"lead":[144],"unnatural":[146],"compositions":[147],"between":[148],"contexts.":[153],"address":[155],"by":[157],"designing":[158],"dedicated":[160],"reward":[161],"signal":[162],"using":[164],"reinforcement":[165],"learning":[166],"seamlessly":[168],"recouple":[169],"visually-defined":[171],"textually-generated":[175],"context.":[176],"Our":[177],"approach":[178],"effectively":[179],"resolves":[180],"paradox,":[182],"enabling":[183],"simultaneous":[184],"high-fidelity":[185],"preservation":[187],"precise":[189],"control.":[191],"Extensive":[192],"experiments":[193],"demonstrate":[194],"method":[197],"achieves":[198],"state-of-the-art":[199],"performance,":[200],"producing":[201],"highly":[202],"realistic":[203],"coherent":[205],"images.":[206]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
