{"id":"https://openalex.org/W7137990175","doi":"https://doi.org/10.1609/aaai.v40i7.37508","title":"Towards Spatially Consistent Image Generation: On Incorporating Intrinsic Scene Properties into Diffusion Models","display_name":"Towards Spatially Consistent Image Generation: On Incorporating Intrinsic Scene Properties into Diffusion Models","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137990175","doi":"https://doi.org/10.1609/aaai.v40i7.37508"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i7.37508","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37508","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i7.37508","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048168648","display_name":"Hyundo Lee","orcid":"https://orcid.org/0009-0009-1184-9820"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hyundo Lee","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102295825","display_name":"Suhyung Choi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Suhyung Choi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129679262","display_name":"Inwoo Hwang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Inwoo Hwang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129744641","display_name":"Byoung-Tak Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Byoung-Tak Zhang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5048168648"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2639821,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"7","first_page":"5863","last_page":"5871"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9610000252723694,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9610000252723694,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.006000000052154064,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10481","display_name":"Computer Graphics and Visualization Techniques","score":0.004100000020116568,"subfield":{"id":"https://openalex.org/subfields/1704","display_name":"Computer Graphics and Computer-Aided Design"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/intrinsics","display_name":"Intrinsics","score":0.6840999722480774},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5709999799728394},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5105999708175659},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.44119998812675476},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.4050000011920929},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3968999981880188},{"id":"https://openalex.org/keywords/spatial-analysis","display_name":"Spatial analysis","score":0.38760000467300415},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.38100001215934753},{"id":"https://openalex.org/keywords/categorical-variable","display_name":"Categorical variable","score":0.3716000020503998}],"concepts":[{"id":"https://openalex.org/C2908650547","wikidata":"https://www.wikidata.org/wiki/Q20999234","display_name":"Intrinsics","level":2,"score":0.6840999722480774},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6488000154495239},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6395000219345093},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5709999799728394},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5539000034332275},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5105999708175659},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.4050000011920929},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3968999981880188},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.38760000467300415},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.38100001215934753},{"id":"https://openalex.org/C5274069","wikidata":"https://www.wikidata.org/wiki/Q2285707","display_name":"Categorical variable","level":2,"score":0.3716000020503998},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.3271999955177307},{"id":"https://openalex.org/C203504353","wikidata":"https://www.wikidata.org/wiki/Q4765461","display_name":"Anisotropic diffusion","level":3,"score":0.31709998846054077},{"id":"https://openalex.org/C83248878","wikidata":"https://www.wikidata.org/wiki/Q344000","display_name":"Active appearance model","level":3,"score":0.30889999866485596},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C152139883","wikidata":"https://www.wikidata.org/wiki/Q252973","display_name":"Mutual information","level":2,"score":0.3000999987125397},{"id":"https://openalex.org/C152565575","wikidata":"https://www.wikidata.org/wiki/Q1124538","display_name":"Conditional random field","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.28279998898506165},{"id":"https://openalex.org/C120515352","wikidata":"https://www.wikidata.org/wiki/Q2564580","display_name":"Image plane","level":3,"score":0.27619999647140503},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.26919999718666077},{"id":"https://openalex.org/C60008888","wikidata":"https://www.wikidata.org/wiki/Q6031013","display_name":"Information bottleneck method","level":3,"score":0.259799987077713},{"id":"https://openalex.org/C53533937","wikidata":"https://www.wikidata.org/wiki/Q185020","display_name":"Histogram","level":3,"score":0.25839999318122864},{"id":"https://openalex.org/C197654239","wikidata":"https://www.wikidata.org/wiki/Q7430757","display_name":"Scene statistics","level":3,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i7.37508","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37508","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i7.37508","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i7.37508","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Image":[0],"generation":[1],"models":[2],"trained":[3],"on":[4,56],"large":[5,104],"datasets":[6],"can":[7],"synthesize":[8],"high-quality":[9],"images":[10,18,71],"but":[11],"often":[12],"produce":[13],"spatially":[14,89],"inconsistent":[15],"and":[16,27,72,86,91,150,162,180,192],"distorted":[17],"due":[19],"to":[20,68,79],"limited":[21],"information":[22,45,116,157],"about":[23,46],"the":[24,47,77,82,111,148,160,190,196],"underlying":[25,48,83],"structures":[26],"spatial":[28,178],"layouts.":[29],"In":[30],"this":[31],"work,":[32],"we":[33,95],"leverage":[34],"intrinsic":[35,99,125,151,163],"scene":[36,84,100,115,126],"properties":[37,101,127],"(e.g.,":[38,199],"depth,":[39],"segmentation":[40],"maps)":[41],"that":[42,53,159,174],"provide":[43],"rich":[44,98],"scene,":[49],"unlike":[50],"prior":[51],"approaches":[52],"solely":[54],"rely":[55],"image-text":[57],"pairs":[58],"or":[59,117],"use":[60],"intrinsics":[61],"as":[62],"conditional":[63],"inputs.":[64],"Our":[65],"approach":[66],"aims":[67],"co-generate":[69],"both":[70],"their":[73],"corresponding":[74],"intrinsics,":[75],"enabling":[76],"model":[78,198],"implicitly":[80],"capture":[81],"structure":[85],"generate":[87],"more":[88,183],"consistent":[90],"realistic":[92],"images.":[93],"Specifically,":[94],"first":[96],"extract":[97],"from":[102],"a":[103,129,182],"image":[105,149,161,169],"dataset":[106],"with":[107],"pre-trained":[108,138],"estimators,":[109],"eliminating":[110],"need":[112],"for":[113],"additional":[114],"explicit":[118],"3D":[119],"representations.":[120],"We":[121],"then":[122],"aggregate":[123],"various":[124],"into":[128],"single":[130],"latent":[131],"variable":[132],"using":[133],"an":[134],"autoencoder.":[135],"Building":[136],"upon":[137],"large-scale":[139],"Latent":[140],"Diffusion":[141],"Models":[142],"(LDMs),":[143],"our":[144,175],"method":[145,176],"simultaneously":[146],"denoises":[147],"domains":[152],"by":[153],"carefully":[154],"sharing":[155],"mutual":[156],"so":[158],"reflect":[164],"each":[165],"other":[166],"without":[167],"degrading":[168],"quality.":[170],"Experimental":[171],"results":[172],"demonstrate":[173],"corrects":[177],"inconsistencies":[179],"produces":[181],"natural":[184],"layout":[185],"of":[186,195],"scenes":[187],"while":[188],"maintaining":[189],"fidelity":[191],"textual":[193],"alignment":[194],"base":[197],"Stable":[200],"Diffusion).":[201]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
