{"id":"https://openalex.org/W4417484990","doi":"https://doi.org/10.1145/3769534.3769595","title":"Unspoken Details: Inferring Hidden Causality and Retrieving Domain-Specific Knowledge for Image Generation","display_name":"Unspoken Details: Inferring Hidden Causality and Retrieving Domain-Specific Knowledge for Image Generation","publication_year":2025,"publication_date":"2025-12-01","ids":{"openalex":"https://openalex.org/W4417484990","doi":"https://doi.org/10.1145/3769534.3769595"},"language":null,"primary_location":{"id":"doi:10.1145/3769534.3769595","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769534.3769595","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th International Symposium on Visual Information Communication and Interaction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3769534.3769595","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100717416","display_name":"Wen You","orcid":"https://orcid.org/0009-0003-1344-8162"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wen You","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"],"raw_orcid":"https://orcid.org/0009-0003-1344-8162","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078204315","display_name":"Zhijun Ma","orcid":"https://orcid.org/0000-0002-8091-3760"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhijun Ma","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"],"raw_orcid":"https://orcid.org/0000-0002-8091-3760","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080208009","display_name":"Zeteng Lin","orcid":"https://orcid.org/0000-0003-1026-3317"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeteng Lin","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"],"raw_orcid":"https://orcid.org/0000-0003-1026-3317","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5114259627","display_name":"Tianzhao Lin","orcid":"https://orcid.org/0009-0002-3473-5155"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Troy TianYu Lin","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"],"raw_orcid":"https://orcid.org/0009-0002-3473-5155","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100717416"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.40414665,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8687000274658203,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8687000274658203,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.055399999022483826,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.006800000090152025,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5892000198364258},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.5888000130653381},{"id":"https://openalex.org/keywords/semantic-gap","display_name":"Semantic gap","score":0.5482000112533569},{"id":"https://openalex.org/keywords/causality","display_name":"Causality (physics)","score":0.5217000246047974},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.4674000144004822},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.44110000133514404},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.3926999866962433},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3727000057697296}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7289000153541565},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5892000198364258},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.5888000130653381},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.553600013256073},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.5482000112533569},{"id":"https://openalex.org/C64357122","wikidata":"https://www.wikidata.org/wiki/Q1149766","display_name":"Causality (physics)","level":2,"score":0.5217000246047974},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5055000185966492},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.4674000144004822},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.44110000133514404},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3926999866962433},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3727000057697296},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3490999937057495},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.3366999924182892},{"id":"https://openalex.org/C11671645","wikidata":"https://www.wikidata.org/wiki/Q5054567","display_name":"Causal model","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C198942812","wikidata":"https://www.wikidata.org/wiki/Q496618","display_name":"Semantic property","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C193125573","wikidata":"https://www.wikidata.org/wiki/Q7449065","display_name":"Semantic interpretation","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C115925183","wikidata":"https://www.wikidata.org/wiki/Q1412694","display_name":"Knowledge-based systems","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C75291252","wikidata":"https://www.wikidata.org/wiki/Q1315756","display_name":"TRACE (psycholinguistics)","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.251800000667572},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3769534.3769595","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769534.3769595","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th International Symposium on Visual Information Communication and Interaction","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3769534.3769595","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3769534.3769595","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th International Symposium on Visual Information Communication and Interaction","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1658156241","https://openalex.org/W2998702515","https://openalex.org/W4312933868","https://openalex.org/W4386076027","https://openalex.org/W4387827820","https://openalex.org/W4390871671","https://openalex.org/W4390873054","https://openalex.org/W4393147272","https://openalex.org/W4402727668","https://openalex.org/W4404724487","https://openalex.org/W4408954419","https://openalex.org/W4409078647","https://openalex.org/W4409182016","https://openalex.org/W4412474742"],"related_works":[],"abstract_inverted_index":{"Text-to-image":[0],"(T2I)":[1],"generation":[2],"has":[3],"advanced":[4],"significantly":[5],"in":[6,155],"recent":[7],"years,":[8],"yet":[9],"current":[10],"models":[11,61],"often":[12],"struggle":[13],"with":[14],"prompts":[15,78],"that":[16,57,192],"imply":[17],"causal":[18,83,129],"sequences":[19],"or":[20,157],"require":[21],"knowledge":[22,97],"of":[23,45,82,133,167,178],"culturally":[24,158],"grounded":[25],"entities.":[26,87],"This":[27,143],"limitation":[28],"stems":[29],"from":[30,94],"a":[31,36,53,95,114,123,138],"fundamental":[32],"\"semantic":[33],"gap\"":[34],"between":[35],"user\u2019s":[37],"rich":[38],"intent":[39],"and":[40,65,85,130,150,173,181,189],"the":[41,128,134,164,175,186],"model\u2019s":[42],"statistical":[43],"interpretation":[44],"text.":[46],"To":[47],"address":[48],"these":[49,101],"limitations,":[50],"we":[51],"propose":[52],"causality-aware":[54],"multimodal":[55,96,111],"framework":[56],"integrates":[58],"large":[59],"language":[60],"(LLMs),":[62],"visual-language":[63],"verification,":[64],"domain-specific":[66],"image":[67,152],"retrieval":[68],"within":[69],"an":[70,109],"iterative,":[71],"self-correcting":[72],"pipeline.":[73],"The":[74],"system":[75],"first":[76],"decomposes":[77],"into":[79,108],"structured":[80],"representations":[81],"chains":[84],"named":[86],"It":[88],"then":[89],"retrieves":[90],"aligned":[91],"visual":[92],"references":[93],"base":[98],"to":[99,196],"ground":[100],"abstract":[102],"concepts.":[103],"These":[104],"components":[105],"are":[106,193],"fused":[107],"enriched":[110],"prompt":[112],"for":[113],"frozen-backbone":[115],"diffusion":[116],"model.":[117],"A":[118],"verification":[119],"module,":[120],"powered":[121],"by":[122,170],"Vision-Language":[124],"Model":[125],"(VLM),":[126],"evaluates":[127],"semantic":[131,187],"consistency":[132],"generated":[135],"output,":[136],"triggering":[137],"refinement":[139],"loop":[140],"when":[141],"necessary.":[142],"closed-loop":[144],"design":[145],"enables":[146],"more":[147,194],"coherent,":[148],"grounded,":[149],"context-sensitive":[151],"synthesis,":[153],"particularly":[154],"complex":[156],"nuanced":[159],"scenarios.":[160],"Our":[161],"approach":[162],"expands":[163],"expressive":[165],"capacity":[166],"T2I":[168],"systems":[169],"explicitly":[171],"modeling":[172],"integrating":[174],"unspoken":[176],"details":[177],"physical":[179],"logic":[180],"domain":[182],"knowledge,":[183],"thereby":[184],"bridging":[185],"gap":[188],"producing":[190],"images":[191],"faithful":[195],"user":[197],"intent.":[198]},"counts_by_year":[],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-12-19T00:00:00"}
