{"id":"https://openalex.org/W4403791999","doi":"https://doi.org/10.1145/3664647.3681284","title":"GOAL: Grounded text-to-image Synthesis with Joint Layout Alignment Tuning","display_name":"GOAL: Grounded text-to-image Synthesis with Joint Layout Alignment Tuning","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403791999","doi":"https://doi.org/10.1145/3664647.3681284"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681284","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681284","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yaqi Li","orcid":"https://orcid.org/0009-0001-2992-4819"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yaqi Li","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-2992-4819","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020428892","display_name":"Han Fang","orcid":"https://orcid.org/0000-0002-4379-2971"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Han Fang","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-4379-2971","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019172074","display_name":"Zerun Feng","orcid":"https://orcid.org/0000-0003-3987-0591"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zerun Feng","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-3987-0591","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104271836","display_name":"Kaijing Ma","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kaijing Ma","raw_affiliation_strings":["Xi'an Jiaotong University, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0002-9559-0797","affiliations":[{"raw_affiliation_string":"Xi'an Jiaotong University, Xi'an, China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093130774","display_name":"Chao Ban","orcid":"https://orcid.org/0009-0000-8114-103X"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Ban","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-8114-103X","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047237045","display_name":"Xianghao Zang","orcid":"https://orcid.org/0000-0001-8421-7167"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianghao Zang","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-8421-7167","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082212103","display_name":"Lanxiang Zhou","orcid":"https://orcid.org/0009-0009-7003-287X"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"LanXiang Zhou","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0009-7003-287X","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101722779","display_name":"Zhongjiang He","orcid":"https://orcid.org/0009-0000-1835-9271"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongjiang He","raw_affiliation_strings":["ChinaTelecom, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-1835-9271","affiliations":[{"raw_affiliation_string":"ChinaTelecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jingyan Chen","orcid":"https://orcid.org/0009-0005-1627-4940"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingyan Chen","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0005-1627-4940","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029551969","display_name":"Jiani Hu","orcid":"https://orcid.org/0009-0001-4407-893X"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiani Hu","raw_affiliation_strings":["Beijing University of Posts and Telecommunications, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0001-4407-893X","affiliations":[{"raw_affiliation_string":"Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063142903","display_name":"Hao Sun","orcid":"https://orcid.org/0009-0007-7917-1628"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Sun","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0007-7917-1628","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036503240","display_name":"Huayu Zhang","orcid":"https://orcid.org/0000-0001-5663-6639"},"institutions":[{"id":"https://openalex.org/I4210136246","display_name":"China Telecom (China)","ror":"https://ror.org/03jgnzt20","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210136246"]},{"id":"https://openalex.org/I4387153335","display_name":"China Telecom","ror":"https://ror.org/05p67dv18","country_code":null,"type":"company","lineage":["https://openalex.org/I4387153335"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huayu Zhang","raw_affiliation_strings":["China Telecom, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5663-6639","affiliations":[{"raw_affiliation_string":"China Telecom, Beijing, China","institution_ids":["https://openalex.org/I4210136246","https://openalex.org/I4387153335"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":12,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.2187,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.52180162,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"7055","last_page":"7064"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9855999946594238,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7147268056869507},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.5882886648178101},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.45580366253852844},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.39373189210891724},{"id":"https://openalex.org/keywords/engineering-drawing","display_name":"Engineering drawing","score":0.3871697783470154},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.37071919441223145},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3584747910499573},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3523828387260437},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.13918429613113403}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7147268056869507},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.5882886648178101},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.45580366253852844},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39373189210891724},{"id":"https://openalex.org/C199639397","wikidata":"https://www.wikidata.org/wiki/Q1788588","display_name":"Engineering drawing","level":1,"score":0.3871697783470154},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.37071919441223145},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3584747910499573},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3523828387260437},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.13918429613113403},{"id":"https://openalex.org/C170154142","wikidata":"https://www.wikidata.org/wiki/Q150737","display_name":"Architectural engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3681284","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3664647.3681284","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2963966654","https://openalex.org/W2985068832","https://openalex.org/W3155072588","https://openalex.org/W3202536355","https://openalex.org/W4226014430","https://openalex.org/W4241614188","https://openalex.org/W4312497550","https://openalex.org/W4312824283","https://openalex.org/W4312933868","https://openalex.org/W4312956471","https://openalex.org/W4382457527","https://openalex.org/W4385270985","https://openalex.org/W4386066731","https://openalex.org/W4386072096","https://openalex.org/W4386076027","https://openalex.org/W4386076215","https://openalex.org/W4386083151","https://openalex.org/W4388187930","https://openalex.org/W4390872387","https://openalex.org/W4390872671","https://openalex.org/W4390889801"],"related_works":["https://openalex.org/W1996130883","https://openalex.org/W2748574964","https://openalex.org/W2888483922","https://openalex.org/W4396737233","https://openalex.org/W2367747139","https://openalex.org/W4391102217","https://openalex.org/W2566187525","https://openalex.org/W2566334511","https://openalex.org/W2367150592","https://openalex.org/W2378889330"],"abstract_inverted_index":{"Recent":[0],"text-to-image":[1],"(T2I)":[2],"synthesis":[3],"models":[4,19,35,40],"have":[5],"demonstrated":[6],"intriguing":[7],"abilities":[8],"to":[9,37,44,92,105,119],"produce":[10],"high-quality":[11],"images":[12,186],"based":[13],"on":[14,41,121,147],"text":[15],"prompts.":[16],"However,":[17],"current":[18],"still":[20],"face":[21],"Text-Image":[22],"Misalignment":[23],"problem":[24],"(e.g.,":[25],"attribute":[26],"errors":[27],"and":[28,80,87,141,150,167,172],"relation":[29],"mistakes)":[30],"for":[31,70,131,175],"compositional":[32],"generation.":[33],"Existing":[34],"attempted":[36],"condition":[38],"T2I":[39,71],"grounding":[42],"inputs":[43],"improve":[45,93],"controllability":[46],"while":[47],"ignoring":[48],"the":[49,53,94,102,117,122,153],"explicit":[50],"supervision":[51],"from":[52],"layout":[54,143],"conditions.":[55],"To":[56],"tackle":[57],"this":[58,90],"issue,":[59],"we":[60],"propose":[61],"Grounded":[62],"jOint":[63],"lAyout":[64],"aLignment":[65],"(GOAL),":[66],"an":[67],"effective":[68],"framework":[69,91],"synthesis.":[72],"Two":[73],"novel":[74],"modules,":[75],"discriminative":[76,99],"semantic":[77,108],"alignment":[78,83,114],"(DSAlign)":[79],"masked":[81],"attention":[82,113],"(MAAlign),":[84],"are":[85,162],"proposed":[86],"incorporated":[88],"in":[89,169],"text-image":[95],"alignment.":[96,109],"DSAlign":[97],"leverages":[98],"tasks":[100],"at":[101],"region-wise":[103],"level":[104],"ensure":[106],"low-level":[107],"MAAlign":[110],"provides":[111],"high-level":[112],"by":[115],"guiding":[116],"model":[118,132],"focus":[120],"target":[123],"object.":[124],"We":[125],"also":[126],"build":[127],"a":[128],"dataset":[129],"GOAL2K":[130],"fine-tuning,":[133],"which":[134],"composes":[135],"2000":[136],"semantically":[137],"accurate":[138],"image-text":[139],"pairs":[140],"their":[142],"annotations.":[144],"Comprehensive":[145],"evaluations":[146],"T2I-Compbench,":[148],"NSR-1K,":[149],"Drawbench":[151],"demonstrate":[152,180],"superior":[154],"generation":[155],"performance":[156],"of":[157,164,187],"our":[158,182],"method.":[159],"Especially,":[160],"there":[161],"improvements":[163],"19%,":[165],"13%,":[166],"12%":[168],"color,":[170],"shape,":[171],"texture":[173],"metrics":[174,179],"T2I-Compbench.":[176],"Additionally,":[177],"Q-Align":[178],"that":[181],"method":[183],"can":[184],"generate":[185],"higher":[188],"quality.":[189]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
