{"id":"https://openalex.org/W7162498070","doi":"https://doi.org/10.48550/arxiv.2605.27235","title":"MRT: Masked Region Transformer for Layered Image Generation and Editing at Scale","display_name":"MRT: Masked Region Transformer for Layered Image Generation and Editing at Scale","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162498070","doi":"https://doi.org/10.48550/arxiv.2605.27235"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.27235","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27235","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.27235","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058354174","display_name":"Zhicong Tang","orcid":"https://orcid.org/0009-0000-0949-4009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Zhicong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137153374","display_name":"Zhao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137151745","display_name":"Jingye Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jingye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137086069","display_name":"Mohan Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Mohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137113702","display_name":"Yifan Pu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pu, Yifan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137166943","display_name":"Yuchi Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yuchi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054533256","display_name":"Yalong Bai","orcid":"https://orcid.org/0000-0002-8416-9027"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yalong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137145940","display_name":"Ethan Smith","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smith, Ethan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137103777","display_name":"Yuhui Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Yuhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5527999997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.5527999997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.062199998646974564,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.03790000081062317,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.6656000018119812},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5530999898910522},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.48330000042915344},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.45980000495910645},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.42260000109672546},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4153999984264374},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4018000066280365}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8167999982833862},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.6656000018119812},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5530999898910522},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.48330000042915344},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.45980000495910645},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.42260000109672546},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4153999984264374},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4018000066280365},{"id":"https://openalex.org/C55020928","wikidata":"https://www.wikidata.org/wiki/Q3813865","display_name":"Image quality","level":3,"score":0.388700008392334},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38510000705718994},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.37290000915527344},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.37220001220703125},{"id":"https://openalex.org/C163173736","wikidata":"https://www.wikidata.org/wiki/Q3308558","display_name":"Key generation","level":3,"score":0.34599998593330383},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.3303000032901764},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.329800009727478},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3098999857902527},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.27720001339912415},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.26269999146461487}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.27235","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27235","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.27235","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.27235","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5555627942085266}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Layered":[0],"image":[1,54,182],"generation":[2,55,108,151],"and":[3,14,56,69,92,109,127,205],"editing":[4,23],"is":[5],"a":[6,44,95,176],"fundamental":[7],"capability":[8],"that":[9,123,159],"enables":[10,105],"layer-wise":[11,107],"reuse,":[12],"editing,":[13,57],"composition":[15],"of":[16],"generated":[17],"visual":[18],"content,":[19],"analogous":[20],"to":[21,112,146,197],"word-level":[22],"in":[24,193],"natural":[25],"language.":[26],"Despite":[27],"its":[28],"importance,":[29],"this":[30,39,75],"remains":[31],"an":[32,119],"underexplored":[33],"area":[34],"at":[35],"scale.":[36],"To":[37,72],"address":[38],"gap,":[40],"we":[41,77,84,117,142],"present":[42],"MRT,":[43],"20B-parameter":[45],"masked":[46,97],"region":[47,98],"diffusion":[48,99,144],"model":[49,186,192],"tailored":[50],"for":[51,179],"multi-layer":[52,150,180],"transparent":[53,181],"trained":[58],"on":[59],"over":[60],"10M":[61],"multilingual":[62],"design":[63],"samples":[64],"spanning":[65],"diverse":[66],"aspect":[67],"ratios":[68],"textual":[70],"prompts.":[71],"fully":[73],"leverage":[74],"scale,":[76],"make":[78],"two":[79],"key":[80],"technical":[81],"contributions.":[82],"First,":[83],"unify":[85],"three":[86,173],"complementary":[87],"tasks":[88],"including":[89,167],"text-to-layers,":[90],"image-to-layers,":[91],"layers-to-layers":[93],"within":[94],"shared":[96],"framework,":[100],"where":[101],"selective":[102],"token":[103],"masking":[104],"flexible":[106],"editing.":[110],"Second,":[111],"enable":[113],"overflow":[114],"layer":[115,122],"generation,":[116],"introduce":[118],"overflow-aware":[120],"canvas":[121,139],"handles":[124],"boundary":[125],"inconsistencies":[126],"supports":[128],"semi-transparent":[129],"background":[130],"synthesis,":[131],"enabling":[132],"complete":[133],"editable":[134],"layers":[135],"extending":[136],"beyond":[137],"visible":[138],"boundaries.":[140],"Additionally,":[141],"apply":[143],"distillation":[145],"achieve":[147],"8-step,":[148],"real-time":[149],"with":[152],"minimal":[153],"quality":[154,195],"degradation.":[155],"Extensive":[156],"experiments":[157],"demonstrate":[158],"our":[160,185],"framework":[161],"substantially":[162],"outperforms":[163,188],"prior":[164],"state-of-the-art":[165],"approaches,":[166],"various":[168],"commercial":[169],"systems,":[170],"across":[171],"all":[172],"tasks,":[174],"establishing":[175],"new":[177],"benchmark":[178],"generation.":[183],"Notably,":[184],"significantly":[187],"the":[189],"concurrent":[190],"Qwen-Image-Layered":[191],"image-to-layers":[194],"according":[196],"user-study":[198],"results,":[199],"while":[200],"achieving":[201],"10-100\\times":[202],"faster":[203],"inference":[204],"reducing":[206],"activation":[207],"GPU":[208],"memory":[209],"consumption":[210],"by":[211],"50-90\\%":[212],"during":[213],"image-to-layer":[214],"inference.":[215]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
