{"id":"https://openalex.org/W4415535518","doi":"https://doi.org/10.1145/3746027.3755541","title":"From Outline to Detail: An Hierarchical End-to-end Framework for Coherent and Consistent Visual Novel Generation and Assembly","display_name":"From Outline to Detail: An Hierarchical End-to-end Framework for Coherent and Consistent Visual Novel Generation and Assembly","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415535518","doi":"https://doi.org/10.1145/3746027.3755541"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755541","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755541","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yilin Zhang","orcid":"https://orcid.org/0009-0008-9541-3348"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yilin Zhang","raw_affiliation_strings":["Hefei University of Technology, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Hefei University of Technology, Hefei, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003728286","display_name":"Yanyan Wei","orcid":"https://orcid.org/0000-0001-8818-6740"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]},{"id":"https://openalex.org/I39774598","display_name":"Hefei University","ror":"https://ror.org/01f5rdf64","country_code":"CN","type":"education","lineage":["https://openalex.org/I39774598"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanyan Wei","raw_affiliation_strings":["Hefei University of Technology, Hefei, China and Intelligent Interconnected Systems Laboratory of Anhui Province, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Hefei University of Technology, Hefei, China and Intelligent Interconnected Systems Laboratory of Anhui Province, Hefei, China","institution_ids":["https://openalex.org/I16365422","https://openalex.org/I39774598"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100423029","display_name":"Zhao Zhang","orcid":"https://orcid.org/0000-0002-5703-7969"},"institutions":[{"id":"https://openalex.org/I16365422","display_name":"Hefei University of Technology","ror":"https://ror.org/02czkny70","country_code":"CN","type":"education","lineage":["https://openalex.org/I16365422"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhao Zhang","raw_affiliation_strings":["Hefei University of Technology, Hefei, China and Yunnan Key Laboratory of Software Engineering, Kunming, China"],"affiliations":[{"raw_affiliation_string":"Hefei University of Technology, Hefei, China and Yunnan Key Laboratory of Software Engineering, Kunming, China","institution_ids":["https://openalex.org/I16365422"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065547403","display_name":"Jicong Fan","orcid":"https://orcid.org/0000-0001-9665-0355"},"institutions":[{"id":"https://openalex.org/I4210116924","display_name":"Chinese University of Hong Kong, Shenzhen","ror":"https://ror.org/02d5ks197","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633","https://openalex.org/I180726961","https://openalex.org/I4210116924"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jicong Fan","raw_affiliation_strings":["The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China","institution_ids":["https://openalex.org/I4210116924"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100458457","display_name":"Haijun Zhang","orcid":"https://orcid.org/0000-0002-1648-0227"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haijun Zhang","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100381753","display_name":"Shuicheng Yan","orcid":"https://orcid.org/0000-0001-8906-3777"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Shuicheng Yan","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I16365422"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15976552,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"8506","last_page":"8516"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.9941999912261963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.9840999841690063,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9821000099182129,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.8736000061035156},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.7350000143051147},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5562000274658203},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5235000252723694},{"id":"https://openalex.org/keywords/presentation","display_name":"Presentation (obstetrics)","score":0.43630000948905945},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.4212000072002411},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.3828999996185303},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3804999887943268}],"concepts":[{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.8736000061035156},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.745199978351593},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.7350000143051147},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5562000274658203},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5235000252723694},{"id":"https://openalex.org/C2777601897","wikidata":"https://www.wikidata.org/wiki/Q3409113","display_name":"Presentation (obstetrics)","level":2,"score":0.43630000948905945},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.4212000072002411},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39910000562667847},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.39329999685287476},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.3828999996185303},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3804999887943268},{"id":"https://openalex.org/C50962388","wikidata":"https://www.wikidata.org/wiki/Q762018","display_name":"Invisibility","level":2,"score":0.33730000257492065},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.3206999897956848},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3151000142097473},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.28630000352859497},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2572999894618988},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C167651023","wikidata":"https://www.wikidata.org/wiki/Q1474611","display_name":"Plot (graphics)","level":2,"score":0.2524999976158142},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755541","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755541","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1048253250","display_name":null,"funder_award_id":"62472137, 62072151, U24A20331","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2963341956","https://openalex.org/W3204365300","https://openalex.org/W4292779060","https://openalex.org/W4312933868","https://openalex.org/W4312964941","https://openalex.org/W4328124524","https://openalex.org/W4388019311","https://openalex.org/W4390872297","https://openalex.org/W4401042993","https://openalex.org/W4401943422","https://openalex.org/W4402508150","https://openalex.org/W4404985745"],"related_works":[],"abstract_inverted_index":{"As":[0],"a":[1,104,130,146],"form":[2],"of":[3,16,72,77,154],"multimedia":[4,138,173],"creation,":[5],"visual":[6],"novel":[7],"(VN)":[8],"conveys":[9],"engaging":[10],"narratives":[11],"through":[12],"the":[13,67,75,152,155],"integrated":[14],"presentation":[15],"text,":[17],"images,":[18],"and":[19,21,43,61,69,79,95,112,140,171],"music,":[20],"has":[22],"shown":[23],"promise":[24],"across":[25],"various":[26],"application":[27],"domains.":[28],"Recent":[29],"advances":[30],"in":[31,37],"generative":[32],"AI":[33],"have":[34],"fueled":[35],"interest":[36],"automating":[38],"VN":[39,50,110,157,166],"creation":[40,51],"using":[41],"LLMs":[42,73],"other":[44],"foundation":[45],"models.":[46],"However,":[47],"fully":[48],"end-to-end":[49,106],"(i.e.,":[52],"from":[53],"user":[54,124],"description":[55],"to":[56,150],"executable":[57,153],"VN)":[58],"remains":[59],"underexplored":[60],"presents":[62],"several":[63],"key":[64],"challenges:":[65],"1)":[66],"hallucination":[68],"limited":[70],"capacity":[71],"hinder":[74],"generation":[76,111,119],"long":[78],"coherent":[80,127,169],"plots;":[81],"2)":[82],"current":[83],"models":[84],"lack":[85],"effective":[86],"mechanisms":[87],"for":[88,108],"ensuring":[89],"cross-modal":[90],"consistency":[91,136],"between":[92,137],"plot,":[93],"visual,":[94],"audio":[96],"elements.":[97],"To":[98],"address":[99],"these":[100],"issues,":[101],"we":[102,144],"propose":[103],"hierarchical":[105],"framework":[107,163],"automatic":[109],"assembly,":[113],"which":[114],"employs":[115],"an":[116],"outline-guided":[117],"autoregressive":[118],"mechanism":[120,134,149],"that":[121,161],"transforms":[122],"high-level":[123],"prompts":[125],"into":[126],"plots,":[128],"while":[129],"vision":[131],"LLM-based":[132],"self-correction":[133],"ensures":[135],"assets":[139],"plot":[141],"content.":[142,174],"Additionally,":[143],"introduce":[145],"script":[147],"validation":[148],"ensure":[151],"final":[156],"application.":[158],"Experiments":[159],"demonstrate":[160],"our":[162],"generates":[164],"high-quality":[165],"applications":[167],"with":[168],"storylines":[170],"consistent":[172]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
