{"id":"https://openalex.org/W4412567077","doi":"https://doi.org/10.1109/tcsvt.2025.3588882","title":"Beyond Inserting: Learning Subject Embedding for Semantic-Fidelity Personalized Diffusion Generation","display_name":"Beyond Inserting: Learning Subject Embedding for Semantic-Fidelity Personalized Diffusion Generation","publication_year":2025,"publication_date":"2025-07-22","ids":{"openalex":"https://openalex.org/W4412567077","doi":"https://doi.org/10.1109/tcsvt.2025.3588882"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2025.3588882","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3588882","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yang Li","orcid":"https://orcid.org/0009-0001-2364-3346"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yang Li","raw_affiliation_strings":["New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-2364-3346","affiliations":[{"raw_affiliation_string":"New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]},{"raw_affiliation_string":"Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100541272","display_name":"Songlin Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songlin Yang","raw_affiliation_strings":["New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]},{"raw_affiliation_string":"Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100757829","display_name":"Wei Wang","orcid":"https://orcid.org/0000-0002-8598-0831"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Wang","raw_affiliation_strings":["New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-8598-0831","affiliations":[{"raw_affiliation_string":"New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]},{"raw_affiliation_string":"Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017743261","display_name":"Jing Dong","orcid":"https://orcid.org/0000-0002-2763-7832"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Dong","raw_affiliation_strings":["New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-2763-7832","affiliations":[{"raw_affiliation_string":"New Laboratory of Pattern Recognition (NLPR), the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]},{"raw_affiliation_string":"Institute of Automation, New Laboratory of Pattern Recognition (NLPR) and the State Key Laboratory of Multimodal Artificial Intelligence Systems (MAIS), Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I4210112150"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.08438631,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"35","issue":"12","first_page":"12607","last_page":"12621"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.933899998664856,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.933899998664856,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7481936812400818},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6945695877075195},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5839139223098755},{"id":"https://openalex.org/keywords/subject","display_name":"Subject (documents)","score":0.5585440397262573},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.483221173286438},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3584522008895874},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3298690617084503},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.3237431049346924},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.2799268960952759}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7481936812400818},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6945695877075195},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5839139223098755},{"id":"https://openalex.org/C2777855551","wikidata":"https://www.wikidata.org/wiki/Q12310021","display_name":"Subject (documents)","level":2,"score":0.5585440397262573},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.483221173286438},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3584522008895874},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3298690617084503},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3237431049346924},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.2799268960952759},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2025.3588882","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2025.3588882","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4699999988079071,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[{"id":"https://openalex.org/G3451019941","display_name":null,"funder_award_id":"62372452","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W2962770929","https://openalex.org/W2969985801","https://openalex.org/W3035574324","https://openalex.org/W3159481202","https://openalex.org/W3180355996","https://openalex.org/W3200571990","https://openalex.org/W3216352822","https://openalex.org/W4205219932","https://openalex.org/W4214926101","https://openalex.org/W4285124635","https://openalex.org/W4286611278","https://openalex.org/W4304098884","https://openalex.org/W4311415873","https://openalex.org/W4312740349","https://openalex.org/W4312933868","https://openalex.org/W4375850648","https://openalex.org/W4380905910","https://openalex.org/W4385245566","https://openalex.org/W4385271055","https://openalex.org/W4385527149","https://openalex.org/W4385535331","https://openalex.org/W4385801739","https://openalex.org/W4386072096","https://openalex.org/W4386076425","https://openalex.org/W4386076532","https://openalex.org/W4386083141","https://openalex.org/W4387968060","https://openalex.org/W4389334989","https://openalex.org/W4389539268","https://openalex.org/W4389539696","https://openalex.org/W4390357625","https://openalex.org/W4390489032","https://openalex.org/W4390534338","https://openalex.org/W4390871782","https://openalex.org/W4390874393","https://openalex.org/W4392151693","https://openalex.org/W4393159706","https://openalex.org/W4393159920","https://openalex.org/W4402704511","https://openalex.org/W4402979349"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W4388258507","https://openalex.org/W2381850946","https://openalex.org/W4380449851","https://openalex.org/W2392013855","https://openalex.org/W4318064328","https://openalex.org/W2081900870","https://openalex.org/W2357926602","https://openalex.org/W3125091513","https://openalex.org/W2932872266"],"abstract_inverted_index":{"Text-to-Image":[0],"(T2I)":[1],"personalization":[2,85],"based":[3],"on":[4,146,239,267],"advanced":[5],"diffusion":[6],"models":[7,45,60],"(e.g.,":[8],"Stable":[9,155],"Diffusion),":[10],"which":[11],"aims":[12],"to":[13,47,89,114,186,246],"generate":[14,49,115],"images":[15,81,119],"of":[16,223,264],"target":[17,69,79],"subjects":[18,35,70],"given":[19,122],"various":[20,268],"prompts,":[21],"has":[22],"drawn":[23],"huge":[24],"attention.":[25],"However,":[26],"when":[27],"users":[28],"require":[29],"personalized":[30,160],"image":[31],"generation":[32,161],"for":[33,120,158],"specific":[34],"such":[36,127],"as":[37,128,210],"themselves":[38],"or":[39,95],"their":[40,50,72],"pet":[41],"cat,":[42],"the":[43,64,68,92,97,121,154,188,224,229,240,262],"T2I":[44,59,65,106],"fail":[46],"accurately":[48,90],"subject-preserved":[51],"images.":[52],"The":[53],"main":[54],"problem":[55],"is":[56],"that":[57,248],"pre-trained":[58],"do":[61],"not":[62],"learn":[63],"mapping":[66],"between":[67],"and":[71,117,136,149,175,199,214,255],"corresponding":[73],"visual":[74],"contents.":[75],"Even":[76],"if":[77],"multiple":[78,211],"subject":[80,93,151,189,196,208,253],"are":[82,112],"provided,":[83],"previous":[84],"methods":[86,266],"either":[87],"failed":[88],"fit":[91],"region":[94],"lost":[96],"interactive":[98,150,201],"generative":[99,202],"ability":[100],"with":[101,124,194],"other":[102,125],"existing":[103],"concepts":[104,126],"in":[105],"model":[107],"space.":[108],"For":[109],"example,":[110],"they":[111],"unable":[113],"T2I-aligned":[116],"semantic-fidelity":[118,159,176],"prompts":[123],"scenes":[129],"(\u201cEiffel":[130],"Tower\u201d),":[131],"actions":[132],"(\u201cholding":[133],"a":[134,182,192],"basketball\u201d),":[135],"facial":[137],"attributes":[138],"(\u201ceyes":[139],"closed\u201d).":[140],"In":[141],"this":[142,167],"paper,":[143],"we":[144,180,205],"focus":[145],"inserting":[147],"accurate":[148],"embedding":[152,190],"into":[153],"Diffusion":[156],"Model":[157],"using":[162],"one":[163,207],"image.":[164],"We":[165,235,259],"address":[166],"challenge":[168],"from":[169],"two":[170,218],"perspectives:":[171],"subject-wise":[172,183],"attention":[173,184],"loss":[174,185],"token":[177,216],"optimization.":[178],"Specifically,":[179],"propose":[181],"guide":[187],"onto":[191],"manifold":[193],"high":[195],"identity":[197],"similarity":[198],"diverse":[200],"ability.":[203,258],"Then,":[204],"optimize":[206],"representation":[209],"per-stage":[212],"tokens,":[213],"each":[215],"contains":[217],"disentangled":[219],"features.":[220],"This":[221],"expansion":[222],"textual":[225],"conditioning":[226],"space":[227],"enhances":[228],"semantic":[230],"control,":[231],"thereby":[232],"improving":[233],"semantic-fidelity.":[234],"conduct":[236],"extensive":[237],"experiments":[238],"most":[241],"challenging":[242],"subjects,":[243],"face":[244],"identities,":[245],"validate":[247,261],"our":[249,265],"results":[250],"exhibit":[251],"superior":[252],"accuracy":[254],"fine-grained":[256],"manipulation":[257],"further":[260],"generalization":[263],"non-face":[269],"subjects.":[270]},"counts_by_year":[],"updated_date":"2025-12-06T23:10:59.065948","created_date":"2025-10-10T00:00:00"}
