{"id":"https://openalex.org/W7137806507","doi":"https://doi.org/10.1609/aaai.v40i10.37722","title":"UM-Text: A Unified Multimodal Model for Image Understanding and Visual Text Editing","display_name":"UM-Text: A Unified Multimodal Model for Image Understanding and Visual Text Editing","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137806507","doi":"https://doi.org/10.1609/aaai.v40i10.37722"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i10.37722","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i10.37722","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i10.37722","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Lichen Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lichen Ma","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xiaolong Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaolong Fu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gaojing Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gaojing Zhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zipeng Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zipeng Guo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ting Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ting Zhu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yichun Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yichun Liu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yu Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu Shi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Jason Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jason Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Junshi Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junshi Huang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.02081599,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"10","first_page":"7791","last_page":"7799"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5516999959945679,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5516999959945679,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.262800008058548,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.017500000074505806,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6014999747276306},{"id":"https://openalex.org/keywords/glyph","display_name":"Glyph (data visualization)","score":0.5917999744415283},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.551800012588501},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5496000051498413},{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.5144000053405762},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4966000020503998},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4334000051021576},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.41670000553131104},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.3926999866962433}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8511999845504761},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6014999747276306},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5982999801635742},{"id":"https://openalex.org/C142816647","wikidata":"https://www.wikidata.org/wiki/Q5573018","display_name":"Glyph (data visualization)","level":3,"score":0.5917999744415283},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5838000178337097},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.551800012588501},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5496000051498413},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.5144000053405762},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4966000020503998},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4334000051021576},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.41670000553131104},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3926999866962433},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.38119998574256897},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.37950000166893005},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3398999869823456},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.3118000030517578},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3084000051021576},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.30640000104904175},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.3037000000476837},{"id":"https://openalex.org/C2777737414","wikidata":"https://www.wikidata.org/wiki/Q4868296","display_name":"Font","level":2,"score":0.2924000024795532},{"id":"https://openalex.org/C2985684807","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Text generation","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2662999927997589},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.2565999925136566},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.25600001215934753},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i10.37722","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i10.37722","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i10.37722","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i10.37722","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7754055857658386}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"With":[0],"the":[1,28,42,52,66,70,104,111,122,137,142,149,158,199],"rapid":[2],"advancement":[3],"of":[4,21,50,144],"image":[5,205],"generation,":[6],"visual":[7,36,87,131,203],"text":[8,37,53,88,112,132,204],"editing":[9,89],"using":[10],"natural":[11,91],"language":[12,92],"instructions":[13],"has":[14],"received":[15],"increasing":[16],"attention.":[17],"The":[18],"main":[19],"challenge":[20],"this":[22],"task":[23],"is":[24,39,151],"to":[25,102,121,140,157,169,190],"fully":[26],"understand":[27],"instruction":[29,105],"and":[30,33,55,62,86,106,114,129,180,183,215],"reference":[31,71,107],"image,":[32,108,133],"thus":[34],"generate":[35,126],"that":[38,110,223],"style-consistent":[40],"with":[41,69],"image.":[43,72],"Previous":[44],"methods":[45],"often":[46],"involve":[47],"complex":[48],"steps":[49],"specifying":[51],"content":[54,113],"attributes,":[56],"such":[57],"as":[58],"font":[59],"size,":[60],"color,":[61],"layout,":[63],"without":[64],"considering":[65],"stylistic":[67],"consistency":[68,167],"To":[73,125],"address":[74],"this,":[75],"we":[76,95,134,163,197],"propose":[77,136,164],"UM-Text,":[78],"a":[79,97,165,185,201],"unified":[80],"multimodal":[81],"model":[82,193,211],"for":[83,174,210],"context":[84,123],"understanding":[85],"by":[90,154],"instructions.":[93],"Specifically,":[94],"introduce":[96],"Visual":[98],"Language":[99],"Model":[100],"(VLM)":[101],"process":[103],"so":[109],"layout":[115],"can":[116],"be":[117],"elaborately":[118],"designed":[119],"according":[120,156],"information.":[124],"an":[127],"accurate":[128],"harmonious":[130],"further":[135,191],"UM":[138],"Encoder":[139],"combine":[141],"embeddings":[143],"various":[145],"condition":[146],"information,":[147],"where":[148],"combination":[150],"automatically":[152],"configured":[153],"VLM":[155],"input":[159],"instruction.":[160],"During":[161],"training,":[162],"regional":[166],"loss":[168],"offer":[170],"more":[171],"effective":[172],"supervision":[173],"glyph":[175],"generation":[176],"on":[177,207,218],"both":[178],"latent":[179],"RGB":[181],"space,":[182],"design":[184],"tailored":[186],"three-stage":[187],"training":[188],"strategy":[189],"enhance":[192],"performance.":[194,228],"In":[195],"addition,":[196],"contribute":[198],"UM-DATA-200K,":[200],"large-scale":[202],"dataset":[206],"diverse":[208],"scenes":[209],"training.":[212],"Extensive":[213],"qualitative":[214],"quantitative":[216],"results":[217],"multiple":[219],"public":[220],"benchmarks":[221],"demonstrate":[222],"our":[224],"method":[225],"achieves":[226],"state-of-the-art":[227]},"counts_by_year":[],"updated_date":"2026-03-18T14:38:29.013473","created_date":"2026-02-06T00:00:00"}
