{"id":"https://openalex.org/W7160921957","doi":"https://doi.org/10.48550/arxiv.2605.08129","title":"Towards Customized Multimodal Role-Play","display_name":"Towards Customized Multimodal Role-Play","publication_year":2026,"publication_date":"2026-05-01","ids":{"openalex":"https://openalex.org/W7160921957","doi":"https://doi.org/10.48550/arxiv.2605.08129"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08129","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08129","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08129","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135932564","display_name":"Chao Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Chao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024765927","display_name":"Jianzong Wu","orcid":"https://orcid.org/0009-0007-4559-7970"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Jianzong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135970808","display_name":"Qingyu Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Qingyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135946539","display_name":"Ye Tian","orcid":"https://orcid.org/0000-0002-2623-9293"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Ye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135964845","display_name":"Aixi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Aixi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135945900","display_name":"Hao Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135978129","display_name":"Jiangning Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiangning","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5024097240","display_name":"Yunhai Tong","orcid":"https://orcid.org/0000-0001-8735-2516"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tong, Yunhai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9175000190734863,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9175000190734863,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.033799998462200165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.01140000019222498,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.7646999955177307},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6310999989509583},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.585099995136261},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5508000254631042},{"id":"https://openalex.org/keywords/personalization","display_name":"Personalization","score":0.5486000180244446},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.46380001306533813},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.4056999981403351},{"id":"https://openalex.org/keywords/cover","display_name":"Cover (algebra)","score":0.3970000147819519}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.782800018787384},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.7646999955177307},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6310999989509583},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.585099995136261},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5508000254631042},{"id":"https://openalex.org/C183003079","wikidata":"https://www.wikidata.org/wiki/Q1000371","display_name":"Personalization","level":2,"score":0.5486000180244446},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4810999929904938},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.46380001306533813},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4226999878883362},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C2780428219","wikidata":"https://www.wikidata.org/wiki/Q16952335","display_name":"Cover (algebra)","level":2,"score":0.3970000147819519},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3953999876976013},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38989999890327454},{"id":"https://openalex.org/C135641252","wikidata":"https://www.wikidata.org/wiki/Q738567","display_name":"Multimodal interaction","level":2,"score":0.351500004529953},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2971999943256378},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.29679998755455017},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.288100004196167},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2879999876022339},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C34972735","wikidata":"https://www.wikidata.org/wiki/Q2920267","display_name":"Engineering design process","level":2,"score":0.2572000026702881}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08129","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08129","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08129","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08129","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Unified":[0,79],"multimodal":[1],"understanding":[2],"and":[3,18,53,63,83,104,109,116,150,168],"generation":[4],"models":[5],"enable":[6],"richer":[7],"human-AI":[8],"interaction.":[9],"Yet":[10],"jointly":[11],"customizing":[12],"a":[13,36,68,74,163],"character's":[14],"persona,":[15,58,107],"dialogue":[16],"style,":[17,108],"visual":[19,110],"identity":[20,111],"while":[21],"maintaining":[22],"output":[23],"consistency":[24,148],"across":[25],"modalities":[26],"remains":[27],"largely":[28],"unexplored.":[29],"To":[30],"mitigate":[31],"this":[32],"gap,":[33],"we":[34,71],"introduce":[35],"new":[37],"task,":[38],"Customized":[39],"Multimodal":[40],"Role-Play":[41],"(CMRP).":[42],"We":[43,154],"construct":[44],"the":[45,98,101,127,132,143],"RoleScape-20":[46,128],"dataset":[47,129],"comprising":[48],"20":[49],"characters,":[50],"including":[51],"training":[52,76],"evaluation":[54],"data":[55],"that":[56,131,156],"cover":[57],"stylistic":[59],"descriptions,":[60],"visual/expressive":[61],"cues,":[62],"text-image":[64],"interactions.":[65],"Building":[66],"on":[67,126],"unified":[69,160],"model,":[70],"devise":[72],"UniCharacter,":[73],"two-stage":[75],"framework":[77],"containing":[78],"Supervised":[80],"Finetuning":[81],"(Unified-SFT)":[82],"character-specific":[84],"group":[85],"relative":[86],"policy":[87],"optimization":[88],"(Character-GRPO).":[89],"Given":[90],"only":[91],"10":[92],"images":[93],"plus":[94],"corresponding":[95],"interaction":[96],"examples,":[97],"model":[99],"acquires":[100],"target":[102],"character":[103],"exhibits":[105],"coherent":[106],"in":[112],"both":[113],"generated":[114],"text":[115],"images.":[117],"This":[118],"process":[119],"takes":[120],"about":[121],"100":[122],"GPU":[123],"hours.":[124],"Experiments":[125],"show":[130],"proposed":[133],"method":[134],"substantially":[135],"outperforms":[136],"prior":[137],"approaches.":[138],"Ablation":[139],"studies":[140],"further":[141],"validate":[142],"effectiveness":[144],"of":[145],"our":[146],"cross-modal":[147],"design":[149],"few-shot":[151],"customization":[152],"strategy.":[153],"argue":[155],"CMRP,":[157],"coupled":[158],"with":[159],"modeling,":[161],"provides":[162],"basis":[164],"for":[165],"next-generation":[166],"characterful":[167],"immersive":[169],"interactive":[170],"agents.":[171]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
