{"id":"https://openalex.org/W4413319048","doi":"https://doi.org/10.1109/tmm.2025.3599077","title":"Semantic-Spatial Attention for Refined Object Placement in Text-to-Image Synthesis","display_name":"Semantic-Spatial Attention for Refined Object Placement in Text-to-Image Synthesis","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4413319048","doi":"https://doi.org/10.1109/tmm.2025.3599077"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3599077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3599077","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101410889","display_name":"Jianwei Zheng","orcid":"https://orcid.org/0000-0002-1896-7135"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jianwei Zheng","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069785955","display_name":"Ni Xu","orcid":"https://orcid.org/0000-0002-2507-1708"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ni Xu","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100739392","display_name":"Wei Li","orcid":"https://orcid.org/0000-0002-4308-4385"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Li","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006540906","display_name":"Jiawei Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiawei Jiang","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100699785","display_name":"Xiaoqin Zhang","orcid":"https://orcid.org/0000-0003-0958-7285"},"institutions":[{"id":"https://openalex.org/I55712492","display_name":"Zhejiang University of Technology","ror":"https://ror.org/02djqfd08","country_code":"CN","type":"education","lineage":["https://openalex.org/I55712492"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoqin Zhang","raw_affiliation_strings":["College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Computer Science and Technology, Zhejiang University of Technology, Hangzhou, China","institution_ids":["https://openalex.org/I55712492"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101410889"],"corresponding_institution_ids":["https://openalex.org/I55712492"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.25481559,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"27","issue":null,"first_page":"7255","last_page":"7270"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9815000295639038,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9787999987602234,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8650268912315369},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5448753833770752},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5134854912757874},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4919266700744629},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.48150405287742615},{"id":"https://openalex.org/keywords/image-synthesis","display_name":"Image synthesis","score":0.4620780944824219},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.37488436698913574},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34525036811828613},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.3279324769973755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8650268912315369},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5448753833770752},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5134854912757874},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4919266700744629},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.48150405287742615},{"id":"https://openalex.org/C2989087649","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Image synthesis","level":3,"score":0.4620780944824219},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.37488436698913574},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34525036811828613},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.3279324769973755}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3599077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3599077","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1773149199","https://openalex.org/W1861492603","https://openalex.org/W1901129140","https://openalex.org/W2185175083","https://openalex.org/W2765811365","https://openalex.org/W2963163163","https://openalex.org/W2964024144","https://openalex.org/W3133431537","https://openalex.org/W3174525637","https://openalex.org/W3191805365","https://openalex.org/W4312531268","https://openalex.org/W4312740349","https://openalex.org/W4312911498","https://openalex.org/W4312933868","https://openalex.org/W4320008790","https://openalex.org/W4385270985","https://openalex.org/W4385271281","https://openalex.org/W4385527149","https://openalex.org/W4385801729","https://openalex.org/W4386065257","https://openalex.org/W4386075631","https://openalex.org/W4386076027","https://openalex.org/W4386113271","https://openalex.org/W4390872387","https://openalex.org/W4390872671","https://openalex.org/W4390873054","https://openalex.org/W4390873281","https://openalex.org/W4390889801","https://openalex.org/W4393148753","https://openalex.org/W4394625750","https://openalex.org/W4396817358","https://openalex.org/W4400410743","https://openalex.org/W4402727702","https://openalex.org/W4402733577"],"related_works":["https://openalex.org/W2772330423","https://openalex.org/W4283758926","https://openalex.org/W1503502830","https://openalex.org/W3020852381","https://openalex.org/W4308217387","https://openalex.org/W2981017910","https://openalex.org/W3002350530","https://openalex.org/W2770776392","https://openalex.org/W3176454756","https://openalex.org/W3163523050"],"abstract_inverted_index":{"Solely":[0],"based":[1],"on":[2],"given":[3],"prompts,":[4],"text-guided":[5],"diffusion":[6,134],"models":[7],"have":[8],"enjoyed":[9],"a":[10,28,59,102,148,198],"unique":[11],"capability":[12],"in":[13,33,39,47,147,185],"generating":[14],"diverse":[15],"and":[16,65,85,119,124,183,188],"creative":[17],"images.":[18,41],"Nevertheless,":[19],"the":[20,35,56,73,82,89,128,143,165,173,194],"conveyance":[21],"of":[22,30,37,44,58,62,88,131,139,167,175,190],"image":[23],"information":[24],"through":[25],"text":[26],"presents":[27],"series":[29],"challenges,":[31],"particularly":[32],"controlling":[34],"positioning":[36,169],"objects":[38],"synthesized":[40],"Despite":[42],"attempts":[43],"recent":[45],"efforts":[46],"exploring":[48],"alternative":[49],"conditions,":[50],"such":[51,179],"as":[52,69,180],"bounding":[53],"box/mask-image":[54],"pairs,":[55],"requirement":[57],"substantial":[60],"amount":[61],"paired":[63],"data":[64,141],"time-consuming":[66],"fine-tuning":[67],"emerge":[68],"new":[70,154],"issues.":[71],"Given":[72],"observations":[74],"that":[75,158],"not":[76,161],"only":[77,162],"prompt-related":[78],"cross-attention":[79],"maps":[80],"reveal":[81],"spatial":[83,108],"arrangement":[84],"centroid":[86,120],"positions":[87],"objects,":[90],"but":[91,170],"also":[92,171],"out-of-prompt":[93],"markers":[94],"enjoy":[95],"rich":[96],"semantic":[97],"information,":[98],"we":[99],"thus":[100],"engineer":[101],"weighted":[103],"optimization":[104],"loss.":[105],"Specifically,":[106],"three":[107],"sub-losses,":[109],"namely":[110],"inner":[111],"box":[112,116],"reinforcement":[113],"loss,":[114,118,121],"outer":[115],"attenuation":[117],"are":[122],"devised":[123],"seamlessly":[125],"integrated":[126],"into":[127],"sampling":[129],"step":[130],"current":[132,177],"vanilla":[133],"models.":[135],"Without":[136],"any":[137],"annotations":[138],"layout":[140],"required,":[142],"final":[144],"approach":[145],"runs":[146],"training-free":[149],"fashion.":[150],"Extensive":[151],"experiments":[152],"with":[153],"performance":[155],"scores":[156],"demonstrate":[157],"our":[159],"proposal":[160],"successfully":[163],"addresses":[164],"issue":[166],"object":[168],"boosts":[172],"capabilities":[174],"most":[176],"models,":[178],"Stable":[181],"Diffusion":[182],"GLIGEN,":[184],"high-quality":[186],"synthesis":[187],"coverage":[189],"various":[191],"concepts.":[192],"Moreover,":[193],"proposed":[195],"mechanism":[196],"plays":[197],"plug-and-play":[199],"role.":[200]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
