{"id":"https://openalex.org/W7155036764","doi":"https://doi.org/10.48550/arxiv.2604.18575","title":"ReCap: Lightweight Referential Grounding for Coherent Story Visualization","display_name":"ReCap: Lightweight Referential Grounding for Coherent Story Visualization","publication_year":2026,"publication_date":"2026-04-20","ids":{"openalex":"https://openalex.org/W7155036764","doi":"https://doi.org/10.48550/arxiv.2604.18575"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.18575","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18575","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.18575","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085736262","display_name":"Aditya Arora","orcid":"https://orcid.org/0000-0002-6010-3912"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Arora, Aditya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134149699","display_name":"Akshita Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Akshita","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103191934","display_name":"Pau Rodr\u00edguez","orcid":"https://orcid.org/0000-0002-1689-8084"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rodriguez, Pau","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134127129","display_name":"Marcus Rohrbach","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rohrbach, Marcus","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5085736262"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.32350000739097595,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.32350000739097595,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.263700008392334,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.20589999854564667,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.7095999717712402},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5641999840736389},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4997999966144562},{"id":"https://openalex.org/keywords/narrative","display_name":"Narrative","score":0.45910000801086426},{"id":"https://openalex.org/keywords/character","display_name":"Character (mathematics)","score":0.45350000262260437},{"id":"https://openalex.org/keywords/stylized-fact","display_name":"Stylized fact","score":0.4203000068664551},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4156999886035919},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3894999921321869}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7110000252723694},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.7095999717712402},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5641999840736389},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5166000127792358},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4997999966144562},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.45910000801086426},{"id":"https://openalex.org/C2780861071","wikidata":"https://www.wikidata.org/wiki/Q1062934","display_name":"Character (mathematics)","level":2,"score":0.45350000262260437},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.44609999656677246},{"id":"https://openalex.org/C38935604","wikidata":"https://www.wikidata.org/wiki/Q4330363","display_name":"Stylized fact","level":2,"score":0.4203000068664551},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4156999886035919},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3894999921321869},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.3375000059604645},{"id":"https://openalex.org/C142816647","wikidata":"https://www.wikidata.org/wiki/Q5573018","display_name":"Glyph (data visualization)","level":3,"score":0.32690000534057617},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.30550000071525574},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2957000136375427},{"id":"https://openalex.org/C185578843","wikidata":"https://www.wikidata.org/wiki/Q10609775","display_name":"Information visualization","level":3,"score":0.2919999957084656},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.28949999809265137},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.2757999897003174}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.18575","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18575","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.18575","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18575","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Story":[0],"Visualization":[1],"aims":[2],"to":[3,94,104,164,225],"generate":[4],"a":[5,12,56,96,121,155,212],"sequence":[6],"of":[7,123,126,235],"images":[8],"that":[9,15,60],"faithfully":[10],"depicts":[11],"textual":[13],"narrative":[14],"preserve":[16],"character":[17,62,162,215],"identity,":[18,134],"spatial":[19],"configuration,":[20],"and":[21,50,64,98,115,128,206],"stylistic":[22],"coherence":[23],"as":[24,85],"the":[25,69,101,124,152,194,233],"narratives":[26,227],"unfold.":[27],"Maintaining":[28],"such":[29],"cross-frame":[30,113],"consistency":[31,58,216],"has":[32],"traditionally":[33],"relied":[34],"on":[35,100,193,204,209,217],"explicit":[36],"memory":[37],"banks,":[38],"architectural":[39],"expansion,":[40],"or":[41,150],"auxiliary":[42],"language":[43],"models,":[44],"resulting":[45],"in":[46,81],"substantial":[47],"parameter":[48],"growth":[49],"inference":[51,186],"overhead.":[52],"We":[53],"introduce":[54],"ReCap,":[55],"lightweight":[57],"framework":[59],"improves":[61],"stability":[63,183],"visual":[65,86,106,156,178],"fidelity":[66],"without":[67],"modifying":[68],"base":[70],"diffusion":[71],"backbone.":[72],"ReCap's":[73],"CORE":[74],"(COnditional":[75],"frame":[76,103],"REferencing)":[77],"module":[78],"treats":[79],"anaphors,":[80],"our":[82],"case":[83],"pronouns,":[84],"anchors,":[87],"activating":[88],"only":[89,117,143],"when":[90],"characters":[91],"are":[92],"referred":[93],"by":[95,171,201,207],"pronoun":[97],"conditioning":[99,114],"preceding":[102],"propagate":[105],"identity.":[107],"This":[108],"selective":[109],"design":[110],"avoids":[111],"unconditional":[112],"introduces":[116],"149K":[118],"additional":[119],"parameters,":[120],"fraction":[122],"cost":[125],"memory-bank":[127],"LLM-augmented":[129],"approaches.":[130],"To":[131],"further":[132],"stabilize":[133],"we":[135,221],"incorporate":[136],"SemDrift":[137,168],"(Guided":[138],"Semantic":[139],"Drift":[140],"Correction)":[141],"applied":[142],"during":[144],"training.":[145],"When":[146],"text":[147],"is":[148],"vague":[149],"referential,":[151],"denoiser":[153,173],"lacks":[154],"anchor":[157],"for":[158,198],"identity-defining":[159],"attributes,":[160],"causing":[161],"appearance":[163],"drift":[165],"across":[166],"frames,":[167],"corrects":[169],"this":[170],"aligning":[172],"representations":[174],"with":[175],"pretrained":[176],"DINOv3":[177],"embeddings,":[179],"enforcing":[180],"semantic":[181],"identity":[182],"at":[184],"zero":[185],"cost.":[187],"ReCap":[188,236],"outperforms":[189],"previous":[190],"state-of-the-art,":[191],"StoryGPT-V,":[192],"two":[195],"main":[196],"benchmarks":[197],"story":[199,223],"visualization":[200,224],"2.63%":[202],"Character-Accuracy":[203],"FlintstonesSV":[205],"5.65%":[208],"PororoSV,":[210],"establishing":[211],"new":[213],"state-of-the-art":[214],"both":[218],"benchmarks.":[219],"Furthermore,":[220],"extend":[222],"human-centric":[226],"derived":[228],"from":[229],"real":[230],"films,":[231],"demonstrating":[232],"capability":[234],"beyond":[237],"stylized":[238],"cartoon":[239],"domains.":[240]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-22T00:00:00"}
