{"id":"https://openalex.org/W7151573710","doi":"https://doi.org/10.48550/arxiv.2604.03738","title":"Rethinking Position Embedding as a Context Controller for Multi-Reference and Multi-Shot Video Generation","display_name":"Rethinking Position Embedding as a Context Controller for Multi-Reference and Multi-Shot Video Generation","publication_year":2026,"publication_date":"2026-04-04","ids":{"openalex":"https://openalex.org/W7151573710","doi":"https://doi.org/10.48550/arxiv.2604.03738"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.03738","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03738","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.03738","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133091311","display_name":"Binyuan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Huang, Binyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042248242","display_name":"Yuning Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yuning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009148506","display_name":"Weinan Jia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Weinan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133134936","display_name":"Hualiang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hualiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101815479","display_name":"Mu Liu","orcid":"https://orcid.org/0000-0001-6434-2884"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Mu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133130349","display_name":"Daiqing Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Daiqing","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5133091311"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4260999858379364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4260999858379364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.17890000343322754,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.10769999772310257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6557000279426575},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6370999813079834},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6069999933242798},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5473999977111816},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5307000279426575},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.5278000235557556},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4943999946117401},{"id":"https://openalex.org/keywords/position","display_name":"Position (finance)","score":0.45739999413490295}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.776199996471405},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6557000279426575},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6370999813079834},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6069999933242798},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5473999977111816},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5307000279426575},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5278000235557556},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5063999891281128},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4943999946117401},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.45739999413490295},{"id":"https://openalex.org/C203479927","wikidata":"https://www.wikidata.org/wiki/Q5165939","display_name":"Controller (irrigation)","level":2,"score":0.40389999747276306},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.39660000801086426},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.39500001072883606},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.3560999929904938},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.32120001316070557},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.3127000033855438},{"id":"https://openalex.org/C150189527","wikidata":"https://www.wikidata.org/wiki/Q356674","display_name":"Reference model","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2644999921321869},{"id":"https://openalex.org/C37279795","wikidata":"https://www.wikidata.org/wiki/Q2492305","display_name":"Consistency model","level":3,"score":0.25529998540878296},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.03738","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03738","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.03738","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03738","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"proprietary":[1],"models":[2],"such":[3],"as":[4,70,78],"Sora2":[5],"demonstrate":[6,126],"promising":[7],"progress":[8],"in":[9],"generating":[10],"multi-shot":[11,110],"videos":[12],"conditioned":[13],"on":[14,21,103],"multiple":[15],"reference":[16,36,47,133],"characters.":[17],"However,":[18],"academic":[19],"research":[20],"this":[22,28],"problem":[23],"remains":[24],"limited.":[25],"We":[26],"study":[27],"task":[29],"and":[30,109,132],"identify":[31],"a":[32,71,107],"core":[33],"challenge:":[34],"when":[35],"images":[37],"exhibit":[38],"highly":[39],"similar":[40,51,121],"appearances,":[41],"the":[42,54,59],"model":[43,113],"often":[44],"suffers":[45],"from":[46],"confusion,":[48],"where":[49],"semantically":[50],"tokens":[52],"degrade":[53],"model's":[55],"ability":[56],"to":[57],"retrieve":[58],"correct":[60],"context.":[61],"To":[62],"address":[63],"this,":[64],"we":[65,105],"introduce":[66],"PoCo":[67,91,128],"(Position":[68],"Embedding":[69],"Context":[72],"Controller),":[73],"which":[74],"incorporates":[75],"position":[76],"encoding":[77],"additional":[79],"context":[80],"control":[81],"beyond":[82],"semantic":[83,99],"retrieval.":[84],"By":[85],"employing":[86],"side":[87],"information":[88],"of":[89,115],"tokens,":[90],"enables":[92],"precise":[93],"token-level":[94],"matching":[95],"while":[96],"preserving":[97],"implicit":[98],"consistency":[100,131],"modeling.":[101],"Building":[102],"PoCo,":[104],"develop":[106],"multi-reference":[108],"video":[111],"generation":[112],"capable":[114],"reliably":[116],"controlling":[117],"characters":[118],"with":[119,136],"extremely":[120],"visual":[122],"traits.":[123],"Extensive":[124],"experiments":[125],"that":[127],"improves":[129],"cross-shot":[130],"fidelity":[134],"compared":[135],"various":[137],"baselines.":[138]},"counts_by_year":[],"updated_date":"2026-04-08T06:07:18.267832","created_date":"2026-04-08T00:00:00"}
