{"id":"https://openalex.org/W7141126420","doi":"https://doi.org/10.48550/arxiv.2603.25420","title":"VideoWeaver: Multimodal Multi-View Video-to-Video Transfer for Embodied Agents","display_name":"VideoWeaver: Multimodal Multi-View Video-to-Video Transfer for Embodied Agents","publication_year":2026,"publication_date":"2026-03-26","ids":{"openalex":"https://openalex.org/W7141126420","doi":"https://doi.org/10.48550/arxiv.2603.25420"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.25420","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25420","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.25420","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049254859","display_name":"George Eskandar","orcid":"https://orcid.org/0000-0002-8099-8717"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Eskandar, George","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052009653","display_name":"Fengyi Shen","orcid":"https://orcid.org/0000-0001-7621-9779"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Fengyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130758189","display_name":"Mohammad Altillawi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Altillawi, Mohammad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130735574","display_name":"Dong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Dong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130815535","display_name":"Yang Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yang, Liudi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Liudi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130764167","display_name":"Ziyuan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ziyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5049254859"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6646000146865845,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6646000146865845,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.07729999721050262,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.04699999839067459,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robot","display_name":"Robot","score":0.6031000018119812},{"id":"https://openalex.org/keywords/embodied-cognition","display_name":"Embodied cognition","score":0.5964999794960022},{"id":"https://openalex.org/keywords/viewpoints","display_name":"Viewpoints","score":0.5230000019073486},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.48339998722076416},{"id":"https://openalex.org/keywords/a-priori-and-a-posteriori","display_name":"A priori and a posteriori","score":0.4075999855995178},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.3677000105381012},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.3626999855041504},{"id":"https://openalex.org/keywords/translation","display_name":"Translation (biology)","score":0.3529999852180481}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7376999855041504},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6065000295639038},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.6031000018119812},{"id":"https://openalex.org/C100609095","wikidata":"https://www.wikidata.org/wiki/Q1335050","display_name":"Embodied cognition","level":2,"score":0.5964999794960022},{"id":"https://openalex.org/C2776035091","wikidata":"https://www.wikidata.org/wiki/Q7928819","display_name":"Viewpoints","level":2,"score":0.5230000019073486},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.48339998722076416},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.4075999855995178},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3626999855041504},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.36070001125335693},{"id":"https://openalex.org/C149364088","wikidata":"https://www.wikidata.org/wiki/Q185917","display_name":"Translation (biology)","level":4,"score":0.3529999852180481},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3402000069618225},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3393000066280365},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.3107999861240387},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3059000074863434},{"id":"https://openalex.org/C192327766","wikidata":"https://www.wikidata.org/wiki/Q1038799","display_name":"Cognitive robotics","level":3,"score":0.29350000619888306},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C146849305","wikidata":"https://www.wikidata.org/wiki/Q370766","display_name":"Ground truth","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.25420","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25420","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.25420","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.25420","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"progress":[1],"in":[2,125,179],"video-to-video":[3],"(V2V)":[4],"translation":[5,99,203],"has":[6],"enabled":[7],"realistic":[8],"resimulation":[9],"of":[10,88,158,184],"embodied":[11,45],"AI":[12,46],"demonstrations,":[13],"a":[14,38,42,106,126,133,155],"capability":[15],"that":[16],"allows":[17,181],"pretrained":[18],"robot":[19,227],"policies":[20],"to":[21,24,55,64,68,80,84,115,121,170,197,223],"be":[22],"transferable":[23],"new":[25,185],"environments":[26],"without":[27],"additional":[28],"data":[29],"collection.":[30],"However,":[31],"prior":[32],"works":[33],"can":[34],"only":[35],"operate":[36],"on":[37,188,200],"single":[39],"view":[40,176],"at":[41,163],"time,":[43,209],"while":[44],"tasks":[47],"are":[48],"commonly":[49],"captured":[50],"from":[51,132],"multiple":[52],"synchronized":[53],"cameras":[54],"support":[56],"policy":[57],"learning.":[58,228],"Naively":[59],"applying":[60],"single-view":[61,107,202],"models":[62],"independently":[63],"each":[65],"camera":[66,150],"leads":[67],"inconsistent":[69],"appearance":[70,143],"across":[71],"views,":[72],"and":[73,148,174,211,219],"standard":[74],"transformer":[75],"architectures":[76],"do":[77],"not":[78],"scale":[79,153],"multi-view":[81,97,117,214],"settings":[82],"due":[83],"the":[85,94,116,168,198,201,207],"quadratic":[86],"cost":[87],"cross-view":[89],"attention.":[90],"We":[91],"present":[92],"VideoWeaver,":[93],"first":[95,208],"multimodal":[96],"V2V":[98,109],"framework.":[100],"VideoWeaver":[101],"is":[102],"initially":[103],"trained":[104],"as":[105],"flow-based":[108],"model.":[110],"To":[111,152],"achieve":[112],"an":[113],"extension":[114],"regime,":[118],"we":[119,160],"propose":[120],"ground":[122],"all":[123],"views":[124,162],"shared":[127],"4D":[128],"latent":[129],"space":[130],"derived":[131],"feed-forward":[134],"spatial":[135],"foundation":[136],"model,":[137],"namely,":[138],"Pi3.":[139],"This":[140,178],"encourages":[141],"view-consistent":[142],"even":[144],"under":[145],"wide":[146],"baselines":[147],"dynamic":[149],"motion.":[151],"beyond":[154],"fixed":[156],"number":[157],"cameras,":[159],"train":[161],"distinct":[164],"diffusion":[165],"timesteps,":[166],"enabling":[167],"model":[169],"learn":[171],"both":[172],"joint":[173],"conditional":[175],"distributions.":[177],"turn":[180],"autoregressive":[182],"synthesis":[183],"viewpoints":[186],"conditioned":[187],"existing":[189],"ones.":[190],"Experiments":[191],"show":[192],"superior":[193],"or":[194],"similar":[195],"performance":[196],"state-of-the-art":[199],"benchmarks":[204],"and,":[205],"for":[206,226],"physically":[210],"stylistically":[212],"consistent":[213],"translations,":[215],"including":[216],"challenging":[217],"egocentric":[218],"heterogeneous-camera":[220],"setups":[221],"central":[222],"world":[224],"randomization":[225]},"counts_by_year":[],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2026-03-28T00:00:00"}
